summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/alpha/kernel/sys_dp264.c8
-rw-r--r--arch/alpha/kernel/sys_titan.c4
-rw-r--r--arch/arm/common/gic.c4
-rw-r--r--arch/cris/arch-v32/kernel/irq.c4
-rw-r--r--arch/ia64/hp/sim/hpsim_irq.c3
-rw-r--r--arch/ia64/kernel/acpi.c5
-rw-r--r--arch/ia64/kernel/iosapic.c10
-rw-r--r--arch/ia64/kernel/msi_ia64.c16
-rw-r--r--arch/ia64/sn/kernel/irq.c4
-rw-r--r--arch/ia64/sn/kernel/msi_sn.c8
-rw-r--r--arch/mips/cavium-octeon/octeon-irq.c8
-rw-r--r--arch/mips/include/asm/irq.h2
-rw-r--r--arch/mips/kernel/irq-gic.c5
-rw-r--r--arch/mips/mti-malta/malta-smtc.c4
-rw-r--r--arch/mips/sibyte/bcm1480/irq.c8
-rw-r--r--arch/mips/sibyte/sb1250/irq.c8
-rw-r--r--arch/parisc/kernel/irq.c6
-rw-r--r--arch/powerpc/include/asm/hw_irq.h39
-rw-r--r--arch/powerpc/include/asm/paca.h1
-rw-r--r--arch/powerpc/include/asm/perf_counter.h98
-rw-r--r--arch/powerpc/include/asm/reg.h2
-rw-r--r--arch/powerpc/include/asm/systbl.h2
-rw-r--r--arch/powerpc/include/asm/unistd.h1
-rw-r--r--arch/powerpc/kernel/Makefile3
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/entry_64.S9
-rw-r--r--arch/powerpc/kernel/irq.c5
-rw-r--r--arch/powerpc/kernel/perf_counter.c1263
-rw-r--r--arch/powerpc/kernel/power4-pmu.c598
-rw-r--r--arch/powerpc/kernel/power5+-pmu.c671
-rw-r--r--arch/powerpc/kernel/power5-pmu.c611
-rw-r--r--arch/powerpc/kernel/power6-pmu.c532
-rw-r--r--arch/powerpc/kernel/power7-pmu.c357
-rw-r--r--arch/powerpc/kernel/ppc970-pmu.c482
-rw-r--r--arch/powerpc/mm/fault.c10
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/powerpc/platforms/pseries/xics.c12
-rw-r--r--arch/powerpc/sysdev/mpic.c4
-rw-r--r--arch/powerpc/sysdev/mpic.h2
-rw-r--r--arch/sparc/include/asm/thread_info_64.h4
-rw-r--r--arch/sparc/kernel/irq_64.c12
-rw-r--r--arch/x86/Kbuild16
-rw-r--r--arch/x86/Kconfig54
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile19
-rw-r--r--arch/x86/boot/.gitignore2
-rw-r--r--arch/x86/boot/Makefile29
-rw-r--r--arch/x86/boot/a20.c9
-rw-r--r--arch/x86/boot/apm.c76
-rw-r--r--arch/x86/boot/bioscall.S82
-rw-r--r--arch/x86/boot/boot.h48
-rw-r--r--arch/x86/boot/compressed/.gitignore3
-rw-r--r--arch/x86/boot/compressed/Makefile54
-rw-r--r--arch/x86/boot/compressed/head_32.S194
-rw-r--r--arch/x86/boot/compressed/head_64.S169
-rw-r--r--arch/x86/boot/compressed/misc.c12
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c97
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S (renamed from arch/x86/boot/compressed/vmlinux_64.lds)29
-rw-r--r--arch/x86/boot/compressed/vmlinux.scr10
-rw-r--r--arch/x86/boot/compressed/vmlinux_32.lds43
-rw-r--r--arch/x86/boot/edd.c71
-rw-r--r--arch/x86/boot/header.S30
-rw-r--r--arch/x86/boot/main.c39
-rw-r--r--arch/x86/boot/mca.c27
-rw-r--r--arch/x86/boot/memory.c79
-rw-r--r--arch/x86/boot/regs.c29
-rw-r--r--arch/x86/boot/setup.ld6
-rw-r--r--arch/x86/boot/tty.c52
-rw-r--r--arch/x86/boot/video-bios.c27
-rw-r--r--arch/x86/boot/video-vesa.c137
-rw-r--r--arch/x86/boot/video-vga.c95
-rw-r--r--arch/x86/boot/video.c42
-rw-r--r--arch/x86/boot/video.h14
-rw-r--r--arch/x86/configs/i386_defconfig148
-rw-r--r--arch/x86/configs/x86_64_defconfig151
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/alternative.h59
-rw-r--r--arch/x86/include/asm/amd_iommu.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h55
-rw-r--r--arch/x86/include/asm/apic.h33
-rw-r--r--arch/x86/include/asm/apicdef.h8
-rw-r--r--arch/x86/include/asm/atomic_32.h236
-rw-r--r--arch/x86/include/asm/boot.h15
-rw-r--r--arch/x86/include/asm/bootparam.h3
-rw-r--r--arch/x86/include/asm/cpu_debug.h101
-rw-r--r--arch/x86/include/asm/cpufeature.h7
-rw-r--r--arch/x86/include/asm/ds.h82
-rw-r--r--arch/x86/include/asm/entry_arch.h2
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h27
-rw-r--r--arch/x86/include/asm/i387.h43
-rw-r--r--arch/x86/include/asm/i8259.h4
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/io_apic.h9
-rw-r--r--arch/x86/include/asm/iomap.h5
-rw-r--r--arch/x86/include/asm/irq_remapping.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h9
-rw-r--r--arch/x86/include/asm/k8.h13
-rw-r--r--arch/x86/include/asm/microcode.h25
-rw-r--r--arch/x86/include/asm/mpspec.h15
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/nmi.h2
-rw-r--r--arch/x86/include/asm/numa_64.h10
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_64_types.h22
-rw-r--r--arch/x86/include/asm/page_types.h6
-rw-r--r--arch/x86/include/asm/paravirt.h22
-rw-r--r--arch/x86/include/asm/perf_counter.h100
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h6
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/processor.h47
-rw-r--r--arch/x86/include/asm/ptrace.h9
-rw-r--r--arch/x86/include/asm/required-features.h8
-rw-r--r--arch/x86/include/asm/setup.h1
-rw-r--r--arch/x86/include/asm/smp.h2
-rw-r--r--arch/x86/include/asm/sparsemem.h2
-rw-r--r--arch/x86/include/asm/syscalls.h45
-rw-r--r--arch/x86/include/asm/thread_info.h4
-rw-r--r--arch/x86/include/asm/tlbflush.h8
-rw-r--r--arch/x86/include/asm/topology.h3
-rw-r--r--arch/x86/include/asm/traps.h5
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h5
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h6
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c156
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile2
-rw-r--r--arch/x86/kernel/acpi/realmode/bioscall.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/regs.c1
-rw-r--r--arch/x86/kernel/amd_iommu.c500
-rw-r--r--arch/x86/kernel/amd_iommu_init.c273
-rw-r--r--arch/x86/kernel/apic/apic.c314
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c4
-rw-r--r--arch/x86/kernel/apic/es7000_32.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c902
-rw-r--r--arch/x86/kernel/apic/nmi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/probe_64.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c20
-rw-r--r--arch/x86/kernel/asm-offsets_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c12
-rw-r--r--arch/x86/kernel/cpu/common.c25
-rw-r--r--arch/x86/kernel/cpu/cpu_debug.c431
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c6
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c153
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel_64.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c24
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h15
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c6
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1704
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/ds.c921
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.h1
-rw-r--r--arch/x86/kernel/e820.c46
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/entry_64.S29
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/irq.c30
-rw-r--r--arch/x86/kernel/irqinit.c (renamed from arch/x86/kernel/irqinit_32.c)152
-rw-r--r--arch/x86/kernel/irqinit_64.c177
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c70
-rw-r--r--arch/x86/kernel/microcode_core.c329
-rw-r--r--arch/x86/kernel/microcode_intel.c90
-rw-r--r--arch/x86/kernel/mpparse.c34
-rw-r--r--arch/x86/kernel/paravirt.c56
-rw-r--r--arch/x86/kernel/pci-calgary_64.c54
-rw-r--r--arch/x86/kernel/pci-gart_64.c55
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c20
-rw-r--r--arch/x86/kernel/process_32.c20
-rw-r--r--arch/x86/kernel/process_64.c20
-rw-r--r--arch/x86/kernel/ptrace.c284
-rw-r--r--arch/x86/kernel/quirks.c37
-rw-r--r--arch/x86/kernel/reboot.c9
-rw-r--r--arch/x86/kernel/setup.c40
-rw-r--r--arch/x86/kernel/setup_percpu.c8
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/smp.c20
-rw-r--r--arch/x86/kernel/smpboot.c22
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tlb_uv.c15
-rw-r--r--arch/x86/kernel/traps.c20
-rw-r--r--arch/x86/kernel/tsc.c19
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/vm86_32.c13
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmlinux.lds.S430
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S229
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S298
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/lguest/boot.c18
-rw-r--r--arch/x86/mm/dump_pagetables.c7
-rw-r--r--arch/x86/mm/fault.c69
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/init.c78
-rw-r--r--arch/x86/mm/init_32.c61
-rw-r--r--arch/x86/mm/init_64.c47
-rw-r--r--arch/x86/mm/iomap_32.c1
-rw-r--r--arch/x86/mm/kmmio.c104
-rw-r--r--arch/x86/mm/memtest.c14
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/numa_64.c33
-rw-r--r--arch/x86/mm/pageattr.c14
-rw-r--r--arch/x86/mm/srat_64.c98
-rw-r--r--arch/x86/oprofile/nmi_int.c7
-rw-r--r--arch/x86/oprofile/op_model_ppro.c8
-rw-r--r--arch/x86/oprofile/op_x86_model.h2
-rw-r--r--arch/x86/pci/irq.c84
-rw-r--r--arch/x86/vdso/vdso32-setup.c6
-rw-r--r--arch/x86/vdso/vma.c8
-rw-r--r--arch/x86/xen/enlighten.c65
-rw-r--r--arch/x86/xen/mmu.c23
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/xen-ops.h1
230 files changed, 12590 insertions, 4676 deletions
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index 9c9d1fd4155..5bd5259324b 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
}
}
-static void
+static int
dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
{
spin_lock(&dp264_irq_lock);
cpu_set_irq_affinity(irq, *affinity);
tsunami_update_irq_hw(cached_irq_mask);
spin_unlock(&dp264_irq_lock);
+
+ return 0;
}
-static void
+static int
clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
{
spin_lock(&dp264_irq_lock);
cpu_set_irq_affinity(irq - 16, *affinity);
tsunami_update_irq_hw(cached_irq_mask);
spin_unlock(&dp264_irq_lock);
+
+ return 0;
}
static struct hw_interrupt_type dp264_irq_type = {
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 27f840a4ad3..8dd239ebdb9 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
}
-static void
+static int
titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
{
spin_lock(&titan_irq_lock);
titan_cpu_set_irq_affinity(irq - 16, *affinity);
titan_update_irq_hw(titan_cached_irq_mask);
spin_unlock(&titan_irq_lock);
+
+ return 0;
}
static void
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index 3e1714c6523..664c7b8b1ba 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq)
}
#ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
+static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
{
void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
unsigned int shift = (irq % 4) * 8;
@@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
val |= 1 << (cpu + shift);
writel(val, reg);
spin_unlock(&irq_controller_lock);
+
+ return 0;
}
#endif
diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c
index df3925cb1c7..d70b445f4a8 100644
--- a/arch/cris/arch-v32/kernel/irq.c
+++ b/arch/cris/arch-v32/kernel/irq.c
@@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq)
{
}
-void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
+int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
{
unsigned long flags;
spin_lock_irqsave(&irq_lock, flags);
irq_allocations[irq - FIRST_IRQ].mask = *dest;
spin_unlock_irqrestore(&irq_lock, flags);
+
+ return 0;
}
static struct irq_chip crisv32_irq_type = {
diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c
index cc0a3182db3..acb5047ab57 100644
--- a/arch/ia64/hp/sim/hpsim_irq.c
+++ b/arch/ia64/hp/sim/hpsim_irq.c
@@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq)
{
}
-static void
+static int
hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
{
+ return 0;
}
static struct hw_interrupt_type irq_type_hp_sim = {
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 5510317db37..baec6f00f7f 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void)
* success: return IRQ number (>=0)
* failure: return < 0
*/
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
{
if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
return gsi;
@@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
fadt = (struct acpi_table_fadt *)fadt_header;
- acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
+ acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
+ ACPI_ACTIVE_LOW);
return 0;
}
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 166e0d839fa..f92cef47bf8 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -329,7 +329,7 @@ unmask_irq (unsigned int irq)
}
-static void
+static int
iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
{
#ifdef CONFIG_SMP
@@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
cpu = cpumask_first_and(cpu_online_mask, mask);
if (cpu >= nr_cpu_ids)
- return;
+ return -1;
if (irq_prepare_move(irq, cpu))
- return;
+ return -1;
dest = cpu_physical_id(cpu);
if (!iosapic_intr_info[irq].count)
- return; /* not an IOSAPIC interrupt */
+ return -1; /* not an IOSAPIC interrupt */
set_irq_affinity_info(irq, dest, redir);
@@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
}
+
#endif
+ return 0;
}
/*
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 2b15e233f7f..0f8ade9331b 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -12,7 +12,7 @@
static struct irq_chip ia64_msi_chip;
#ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq,
+static int ia64_set_msi_irq_affinity(unsigned int irq,
const cpumask_t *cpu_mask)
{
struct msi_msg msg;
@@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
int cpu = first_cpu(*cpu_mask);
if (!cpu_online(cpu))
- return;
+ return -1;
if (irq_prepare_move(irq, cpu))
- return;
+ return -1;
read_msi_msg(irq, &msg);
@@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
write_msi_msg(irq, &msg);
cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
+
+ return 0;
}
#endif /* CONFIG_SMP */
@@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq)
#ifdef CONFIG_DMAR
#ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_cfg *cfg = irq_cfg + irq;
struct msi_msg msg;
int cpu = cpumask_first(mask);
if (!cpu_online(cpu))
- return;
+ return -1;
if (irq_prepare_move(irq, cpu))
- return;
+ return -1;
dmar_msi_read(irq, &msg);
@@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
dmar_msi_write(irq, &msg);
cpumask_copy(irq_desc[irq].affinity, mask);
+
+ return 0;
}
#endif /* CONFIG_SMP */
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 66fd705e82c..764f26abac0 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -227,7 +227,7 @@ finish_up:
return new_irq_info;
}
-static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
+static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
{
struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
nasid_t nasid;
@@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
sn_irq_lh[irq], list)
(void)sn_retarget_vector(sn_irq_info, nasid, slice);
+
+ return 0;
}
#ifdef CONFIG_SMP
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index 81e428943d7..fbbfb970120 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
}
#ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq,
+static int sn_set_msi_irq_affinity(unsigned int irq,
const struct cpumask *cpu_mask)
{
struct msi_msg msg;
@@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
cpu = cpumask_first(cpu_mask);
sn_irq_info = sn_msi_info[irq].sn_irq_info;
if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
- return;
+ return -1;
/*
* Release XIO resources for the old MSI PCI address
@@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice);
sn_msi_info[irq].sn_irq_info = new_irq_info;
if (new_irq_info == NULL)
- return;
+ return -1;
/*
* Map the xio address into bus space
@@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
write_msi_msg(irq, &msg);
cpumask_copy(irq_desc[irq].affinity, cpu_mask);
+
+ return 0;
}
#endif /* CONFIG_SMP */
diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c
index 1c19af8daa6..d3a0c8154be 100644
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq)
}
#ifdef CONFIG_SMP
-static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
{
int cpu;
int bit = irq - OCTEON_IRQ_WORKQ0; /* Bit 0-63 of EN0 */
@@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask
*/
cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2));
write_unlock(&octeon_irq_ciu0_rwlock);
+
+ return 0;
}
#endif
@@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq)
}
#ifdef CONFIG_SMP
-static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
{
int cpu;
int bit = irq - OCTEON_IRQ_WDOG0; /* Bit 0-63 of EN1 */
@@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask
*/
cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1));
write_unlock(&octeon_irq_ciu1_rwlock);
+
+ return 0;
}
#endif
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index 3214ade02d1..4f1eed107b0 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq)
#ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
#include <linux/cpumask.h>
-extern void plat_set_irq_affinity(unsigned int irq,
+extern int plat_set_irq_affinity(unsigned int irq,
const struct cpumask *affinity);
extern void smtc_forward_irq(unsigned int irq);
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 87deb8f6c45..3f43c2e3aa5 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
static DEFINE_SPINLOCK(gic_lock);
-static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
cpumask_t tmp = CPU_MASK_NONE;
unsigned long flags;
@@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
cpumask_and(&tmp, cpumask, cpu_online_mask);
if (cpus_empty(tmp))
- return;
+ return -1;
/* Assumption : cpumask refers to a single CPU */
spin_lock_irqsave(&gic_lock, flags);
@@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
cpumask_copy(irq_desc[irq].affinity, cpumask);
spin_unlock_irqrestore(&gic_lock, flags);
+ return 0;
}
#endif
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index 5ba31888fef..499ffe5475d 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = {
*/
-void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
+int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
{
cpumask_t tmask;
int cpu = 0;
@@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
/* Do any generic SMTC IRQ affinity setup */
smtc_set_irq_affinity(irq, tmask);
+
+ return 0;
}
#endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */
diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c
index c147c4b35d3..690de06bde9 100644
--- a/arch/mips/sibyte/bcm1480/irq.c
+++ b/arch/mips/sibyte/bcm1480/irq.c
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
static void disable_bcm1480_irq(unsigned int irq);
static void ack_bcm1480_irq(unsigned int irq);
#ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
#endif
#ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
}
#ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
{
int i = 0, old_cpu, cpu, int_on, k;
u64 cur_ints;
@@ -118,7 +118,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
if (cpumask_weight(mask) != 1) {
printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
- return;
+ return -1;
}
i = cpumask_first(mask);
@@ -152,6 +152,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
}
}
spin_unlock_irqrestore(&bcm1480_imr_lock, flags);
+
+ return 0;
}
#endif
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c
index 38cb998ade2..409dec79886 100644
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
static void disable_sb1250_irq(unsigned int irq);
static void ack_sb1250_irq(unsigned int irq);
#ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
#endif
#ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq)
}
#ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
{
int i = 0, old_cpu, cpu, int_on;
u64 cur_ints;
@@ -113,7 +113,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
if (cpumask_weight(mask) > 1) {
printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
- return;
+ return -1;
}
/* Convert logical CPU to physical CPU */
@@ -143,6 +143,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
R_IMR_INTERRUPT_MASK));
}
spin_unlock_irqrestore(&sb1250_imr_lock, flags);
+
+ return 0;
}
#endif
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 4ea4229d765..8007f1e6572 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
return cpu_dest;
}
-static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
+static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
{
int cpu_dest;
cpu_dest = cpu_check_affinity(irq, dest);
if (cpu_dest < 0)
- return;
+ return -1;
cpumask_copy(&irq_desc[irq].affinity, dest);
+
+ return 0;
}
#endif
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b7e034b0a6d..20a44d0c9fd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
*/
struct irq_chip;
+#ifdef CONFIG_PERF_COUNTERS
+static inline unsigned long test_perf_counter_pending(void)
+{
+ unsigned long x;
+
+ asm volatile("lbz %0,%1(13)"
+ : "=r" (x)
+ : "i" (offsetof(struct paca_struct, perf_counter_pending)));
+ return x;
+}
+
+static inline void set_perf_counter_pending(void)
+{
+ asm volatile("stb %0,%1(13)" : :
+ "r" (1),
+ "i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+static inline void clear_perf_counter_pending(void)
+{
+ asm volatile("stb %0,%1(13)" : :
+ "r" (0),
+ "i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+extern void perf_counter_do_pending(void);
+
+#else
+
+static inline unsigned long test_perf_counter_pending(void)
+{
+ return 0;
+}
+
+static inline void set_perf_counter_pending(void) {}
+static inline void clear_perf_counter_pending(void) {}
+static inline void perf_counter_do_pending(void) {}
+#endif /* CONFIG_PERF_COUNTERS */
+
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf14..6ef05572301 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
u8 soft_enabled; /* irq soft-enable flag */
u8 hard_enabled; /* set if irqs are enabled in MSR */
u8 io_sync; /* writel() needs spin_unlock sync */
+ u8 perf_counter_pending; /* PM interrupt while soft-disabled */
/* Stuff for accurate time accounting */
u64 user_time; /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 00000000000..cc7c887705b
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,98 @@
+/*
+ * Performance counter support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+
+#define MAX_HWCOUNTERS 8
+#define MAX_EVENT_ALTERNATIVES 8
+#define MAX_LIMITED_HWCOUNTERS 2
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+ int n_counter;
+ int max_alternatives;
+ u64 add_fields;
+ u64 test_adder;
+ int (*compute_mmcr)(u64 events[], int n_ev,
+ unsigned int hwc[], u64 mmcr[]);
+ int (*get_constraint)(u64 event, u64 *mskp, u64 *valp);
+ int (*get_alternatives)(u64 event, unsigned int flags,
+ u64 alt[]);
+ void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+ int (*limited_pmc_event)(u64 event);
+ u32 flags;
+ int n_generic;
+ int *generic_events;
+ int (*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+};
+
+extern struct power_pmu *ppmu;
+
+/*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */
+
+/*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */
+
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs) perf_misc_flags(regs)
+
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
+/*
+ * The power_pmu.get_constraint function returns a 64-bit value and
+ * a 64-bit mask that express the constraints between this event and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event. For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits. The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class. A field of k bits can be used for
+ * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set. In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play. The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes. (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.) For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits. The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event's class set. The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e8018d540e8..fb359b0a693 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -492,11 +492,13 @@
#define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */
#define SPRN_MMCR1 798
#define SPRN_MMCRA 0x312
+#define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
#define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */
#define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */
#define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */
#define MMCRA_SLOT_SHIFT 24
#define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
+#define POWER6_MMCRA_SDSYNC 0x0000080000000000ULL /* SDAR/SIAR synced */
#define POWER6_MMCRA_SIHV 0x0000040000000000ULL
#define POWER6_MMCRA_SIPR 0x0000020000000000ULL
#define POWER6_MMCRA_THRM 0x00000020UL
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index d98a30dfd41..a0b92de51c7 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)
SYSCALL_SPU(dup3)
SYSCALL_SPU(pipe2)
SYSCALL(inotify_init1)
-SYSCALL(ni_syscall)
+SYSCALL_SPU(perf_counter_open)
COMPAT_SYS_SPU(preadv)
COMPAT_SYS_SPU(pwritev)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 3f06f8ec81c..4badac2d11d 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,6 +341,7 @@
#define __NR_dup3 316
#define __NR_pipe2 317
#define __NR_inotify_init1 318
+#define __NR_perf_counter_open 319
#define __NR_preadv 320
#define __NR_pwritev 321
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 71901fbda4a..a2c683403c2 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,9 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \
+ power5-pmu.o power5+-pmu.o power6-pmu.o \
+ power7-pmu.o
obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1e40bc05394..e981d1ce191 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+ DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index abfc3233047..43e073477c3 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
2:
TRACE_AND_RESTORE_IRQ(r5);
+#ifdef CONFIG_PERF_COUNTERS
+ /* check paca->perf_counter_pending if we're enabling ints */
+ lbz r3,PACAPERFPEND(r13)
+ and. r3,r3,r5
+ beq 27f
+ bl .perf_counter_do_pending
+27:
+#endif /* CONFIG_PERF_COUNTERS */
+
/* extract EE bit and use it to restore paca->hard_enabled */
ld r3,_MSR(r1)
rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 8c1a4966867..feff792ed0f 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
iseries_handle_interrupts();
}
+ if (test_perf_counter_pending()) {
+ clear_perf_counter_pending();
+ perf_counter_do_pending();
+ }
+
/*
* if (get_paca()->hard_enabled) return;
* But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 00000000000..bb202388170
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,1263 @@
+/*
+ * Performance counter support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/ptrace.h>
+
+struct cpu_hw_counters {
+ int n_counters;
+ int n_percpu;
+ int disabled;
+ int n_added;
+ int n_limited;
+ u8 pmcs_enabled;
+ struct perf_counter *counter[MAX_HWCOUNTERS];
+ u64 events[MAX_HWCOUNTERS];
+ unsigned int flags[MAX_HWCOUNTERS];
+ u64 mmcr[3];
+ struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+ u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS];
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+struct power_pmu *ppmu;
+
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_counters_kernel = MMCR0_FCS;
+
+static void perf_counter_interrupt(struct pt_regs *regs);
+
+void perf_counter_print_debug(void)
+{
+}
+
+/*
+ * Read one performance monitor counter (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+ unsigned long val;
+
+ switch (idx) {
+ case 1:
+ val = mfspr(SPRN_PMC1);
+ break;
+ case 2:
+ val = mfspr(SPRN_PMC2);
+ break;
+ case 3:
+ val = mfspr(SPRN_PMC3);
+ break;
+ case 4:
+ val = mfspr(SPRN_PMC4);
+ break;
+ case 5:
+ val = mfspr(SPRN_PMC5);
+ break;
+ case 6:
+ val = mfspr(SPRN_PMC6);
+ break;
+ case 7:
+ val = mfspr(SPRN_PMC7);
+ break;
+ case 8:
+ val = mfspr(SPRN_PMC8);
+ break;
+ default:
+ printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+ val = 0;
+ }
+ return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+ switch (idx) {
+ case 1:
+ mtspr(SPRN_PMC1, val);
+ break;
+ case 2:
+ mtspr(SPRN_PMC2, val);
+ break;
+ case 3:
+ mtspr(SPRN_PMC3, val);
+ break;
+ case 4:
+ mtspr(SPRN_PMC4, val);
+ break;
+ case 5:
+ mtspr(SPRN_PMC5, val);
+ break;
+ case 6:
+ mtspr(SPRN_PMC6, val);
+ break;
+ case 7:
+ mtspr(SPRN_PMC7, val);
+ break;
+ case 8:
+ mtspr(SPRN_PMC8, val);
+ break;
+ default:
+ printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+ }
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event[].
+ */
+static int power_check_constraints(u64 event[], unsigned int cflags[],
+ int n_ev)
+{
+ u64 mask, value, nv;
+ u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+ u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+ u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+ u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
+ int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
+ int i, j;
+ u64 addf = ppmu->add_fields;
+ u64 tadd = ppmu->test_adder;
+
+ if (n_ev > ppmu->n_counter)
+ return -1;
+
+ /* First see if the events will go on as-is */
+ for (i = 0; i < n_ev; ++i) {
+ if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+ && !ppmu->limited_pmc_event(event[i])) {
+ ppmu->get_alternatives(event[i], cflags[i],
+ alternatives[i]);
+ event[i] = alternatives[i][0];
+ }
+ if (ppmu->get_constraint(event[i], &amasks[i][0],
+ &avalues[i][0]))
+ return -1;
+ }
+ value = mask = 0;
+ for (i = 0; i < n_ev; ++i) {
+ nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
+ if ((((nv + tadd) ^ value) & mask) != 0 ||
+ (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
+ break;
+ value = nv;
+ mask |= amasks[i][0];
+ }
+ if (i == n_ev)
+ return 0; /* all OK */
+
+ /* doesn't work, gather alternatives... */
+ if (!ppmu->get_alternatives)
+ return -1;
+ for (i = 0; i < n_ev; ++i) {
+ choice[i] = 0;
+ n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+ alternatives[i]);
+ for (j = 1; j < n_alt[i]; ++j)
+ ppmu->get_constraint(alternatives[i][j],
+ &amasks[i][j], &avalues[i][j]);
+ }
+
+ /* enumerate all possibilities and see if any will work */
+ i = 0;
+ j = -1;
+ value = mask = nv = 0;
+ while (i < n_ev) {
+ if (j >= 0) {
+ /* we're backtracking, restore context */
+ value = svalues[i];
+ mask = smasks[i];
+ j = choice[i];
+ }
+ /*
+ * See if any alternative k for event i,
+ * where k > j, will satisfy the constraints.
+ */
+ while (++j < n_alt[i]) {
+ nv = (value | avalues[i][j]) +
+ (value & avalues[i][j] & addf);
+ if ((((nv + tadd) ^ value) & mask) == 0 &&
+ (((nv + tadd) ^ avalues[i][j])
+ & amasks[i][j]) == 0)
+ break;
+ }
+ if (j >= n_alt[i]) {
+ /*
+ * No feasible alternative, backtrack
+ * to event i-1 and continue enumerating its
+ * alternatives from where we got up to.
+ */
+ if (--i < 0)
+ return -1;
+ } else {
+ /*
+ * Found a feasible alternative for event i,
+ * remember where we got up to with this event,
+ * go on to the next event, and start with
+ * the first alternative for it.
+ */
+ choice[i] = j;
+ svalues[i] = value;
+ smasks[i] = mask;
+ value = nv;
+ mask |= amasks[i][j];
+ ++i;
+ j = -1;
+ }
+ }
+
+ /* OK, we have a feasible combination, tell the caller the solution */
+ for (i = 0; i < n_ev; ++i)
+ event[i] = alternatives[i][choice[i]];
+ return 0;
+}
+
+/*
+ * Check if newly-added counters have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added counters.
+ */
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+ int n_prev, int n_new)
+{
+ int eu = 0, ek = 0, eh = 0;
+ int i, n, first;
+ struct perf_counter *counter;
+
+ n = n_prev + n_new;
+ if (n <= 1)
+ return 0;
+
+ first = 1;
+ for (i = 0; i < n; ++i) {
+ if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+ cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+ continue;
+ }
+ counter = ctrs[i];
+ if (first) {
+ eu = counter->attr.exclude_user;
+ ek = counter->attr.exclude_kernel;
+ eh = counter->attr.exclude_hv;
+ first = 0;
+ } else if (counter->attr.exclude_user != eu ||
+ counter->attr.exclude_kernel != ek ||
+ counter->attr.exclude_hv != eh) {
+ return -EAGAIN;
+ }
+ }
+
+ if (eu || ek || eh)
+ for (i = 0; i < n; ++i)
+ if (cflags[i] & PPMU_LIMITED_PMC_OK)
+ cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
+ return 0;
+}
+
+static void power_pmu_read(struct perf_counter *counter)
+{
+ long val, delta, prev;
+
+ if (!counter->hw.idx)
+ return;
+ /*
+ * Performance monitor interrupts come even when interrupts
+ * are soft-disabled, as long as interrupts are hard-enabled.
+ * Therefore we treat them like NMIs.
+ */
+ do {
+ prev = atomic64_read(&counter->hw.prev_count);
+ barrier();
+ val = read_pmc(counter->hw.idx);
+ } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
+
+ /* The counters are only 32 bits wide */
+ delta = (val - prev) & 0xfffffffful;
+ atomic64_add(delta, &counter->count);
+ atomic64_sub(delta, &counter->hw.period_left);
+}
+
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts. This tells
+ * us if `counter' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+ return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+ && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+ unsigned long pmc5, unsigned long pmc6)
+{
+ struct perf_counter *counter;
+ u64 val, prev, delta;
+ int i;
+
+ for (i = 0; i < cpuhw->n_limited; ++i) {
+ counter = cpuhw->limited_counter[i];
+ if (!counter->hw.idx)
+ continue;
+ val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+ prev = atomic64_read(&counter->hw.prev_count);
+ counter->hw.idx = 0;
+ delta = (val - prev) & 0xfffffffful;
+ atomic64_add(delta, &counter->count);
+ }
+}
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+ unsigned long pmc5, unsigned long pmc6)
+{
+ struct perf_counter *counter;
+ u64 val;
+ int i;
+
+ for (i = 0; i < cpuhw->n_limited; ++i) {
+ counter = cpuhw->limited_counter[i];
+ counter->hw.idx = cpuhw->limited_hwidx[i];
+ val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+ atomic64_set(&counter->hw.prev_count, val);
+ perf_counter_update_userpage(counter);
+ }
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters. We try to keep the values from the limited
+ * counters as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited counters as small and consistent as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+ unsigned long pmc5, pmc6;
+
+ if (!cpuhw->n_limited) {
+ mtspr(SPRN_MMCR0, mmcr0);
+ return;
+ }
+
+ /*
+ * Write MMCR0, then read PMC5 and PMC6 immediately.
+ * To ensure we don't get a performance monitor interrupt
+ * between writing MMCR0 and freezing/thawing the limited
+ * counters, we first write MMCR0 with the counter overflow
+ * interrupt enable bits turned off.
+ */
+ asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+ : "=&r" (pmc5), "=&r" (pmc6)
+ : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+ "i" (SPRN_MMCR0),
+ "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+ if (mmcr0 & MMCR0_FC)
+ freeze_limited_counters(cpuhw, pmc5, pmc6);
+ else
+ thaw_limited_counters(cpuhw, pmc5, pmc6);
+
+ /*
+ * Write the full MMCR0 including the counter overflow interrupt
+ * enable bits, if necessary.
+ */
+ if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+ mtspr(SPRN_MMCR0, mmcr0);
+}
+
+/*
+ * Disable all counters to prevent PMU interrupts and to allow
+ * counters to be added or removed.
+ */
+void hw_perf_disable(void)
+{
+ struct cpu_hw_counters *cpuhw;
+ unsigned long ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ cpuhw = &__get_cpu_var(cpu_hw_counters);
+
+ ret = cpuhw->disabled;
+ if (!ret) {
+ cpuhw->disabled = 1;
+ cpuhw->n_added = 0;
+
+ /*
+ * Check if we ever enabled the PMU on this cpu.
+ */
+ if (!cpuhw->pmcs_enabled) {
+ if (ppc_md.enable_pmcs)
+ ppc_md.enable_pmcs();
+ cpuhw->pmcs_enabled = 1;
+ }
+
+ /*
+ * Disable instruction sampling if it was enabled
+ */
+ if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+ mtspr(SPRN_MMCRA,
+ cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+ mb();
+ }
+
+ /*
+ * Set the 'freeze counters' bit.
+ * The barrier is to make sure the mtspr has been
+ * executed and the PMU has frozen the counters
+ * before we return.
+ */
+ write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
+ mb();
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Re-enable all counters if disable == 0.
+ * If we were previously disabled and counters were added, then
+ * put the new config on the PMU.
+ */
+void hw_perf_enable(void)
+{
+ struct perf_counter *counter;
+ struct cpu_hw_counters *cpuhw;
+ unsigned long flags;
+ long i;
+ unsigned long val;
+ s64 left;
+ unsigned int hwc_index[MAX_HWCOUNTERS];
+ int n_lim;
+ int idx;
+
+ local_irq_save(flags);
+ cpuhw = &__get_cpu_var(cpu_hw_counters);
+ if (!cpuhw->disabled) {
+ local_irq_restore(flags);
+ return;
+ }
+ cpuhw->disabled = 0;
+
+ /*
+ * If we didn't change anything, or only removed counters,
+ * no need to recalculate MMCR* settings and reset the PMCs.
+ * Just reenable the PMU with the current MMCR* settings
+ * (possibly updated for removal of counters).
+ */
+ if (!cpuhw->n_added) {
+ mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+ mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+ if (cpuhw->n_counters == 0)
+ get_lppaca()->pmcregs_in_use = 0;
+ goto out_enable;
+ }
+
+ /*
+ * Compute MMCR* values for the new set of counters
+ */
+ if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
+ cpuhw->mmcr)) {
+ /* shouldn't ever get here */
+ printk(KERN_ERR "oops compute_mmcr failed\n");
+ goto out;
+ }
+
+ /*
+ * Add in MMCR0 freeze bits corresponding to the
+ * attr.exclude_* bits for the first counter.
+ * We have already checked that all counters have the
+ * same values for these bits as the first counter.
+ */
+ counter = cpuhw->counter[0];
+ if (counter->attr.exclude_user)
+ cpuhw->mmcr[0] |= MMCR0_FCP;
+ if (counter->attr.exclude_kernel)
+ cpuhw->mmcr[0] |= freeze_counters_kernel;
+ if (counter->attr.exclude_hv)
+ cpuhw->mmcr[0] |= MMCR0_FCHV;
+
+ /*
+ * Write the new configuration to MMCR* with the freeze
+ * bit set and set the hardware counters to their initial values.
+ * Then unfreeze the counters.
+ */
+ get_lppaca()->pmcregs_in_use = 1;
+ mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+ mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+ mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+ | MMCR0_FC);
+
+ /*
+ * Read off any pre-existing counters that need to move
+ * to another PMC.
+ */
+ for (i = 0; i < cpuhw->n_counters; ++i) {
+ counter = cpuhw->counter[i];
+ if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
+ power_pmu_read(counter);
+ write_pmc(counter->hw.idx, 0);
+ counter->hw.idx = 0;
+ }
+ }
+
+ /*
+ * Initialize the PMCs for all the new and moved counters.
+ */
+ cpuhw->n_limited = n_lim = 0;
+ for (i = 0; i < cpuhw->n_counters; ++i) {
+ counter = cpuhw->counter[i];
+ if (counter->hw.idx)
+ continue;
+ idx = hwc_index[i] + 1;
+ if (is_limited_pmc(idx)) {
+ cpuhw->limited_counter[n_lim] = counter;
+ cpuhw->limited_hwidx[n_lim] = idx;
+ ++n_lim;
+ continue;
+ }
+ val = 0;
+ if (counter->hw.sample_period) {
+ left = atomic64_read(&counter->hw.period_left);
+ if (left < 0x80000000L)
+ val = 0x80000000L - left;
+ }
+ atomic64_set(&counter->hw.prev_count, val);
+ counter->hw.idx = idx;
+ write_pmc(idx, val);
+ perf_counter_update_userpage(counter);
+ }
+ cpuhw->n_limited = n_lim;
+ cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+ mb();
+ write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+ /*
+ * Enable instruction sampling if necessary
+ */
+ if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+ mb();
+ mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+ }
+
+ out:
+ local_irq_restore(flags);
+}
+
+static int collect_events(struct perf_counter *group, int max_count,
+ struct perf_counter *ctrs[], u64 *events,
+ unsigned int *flags)
+{
+ int n = 0;
+ struct perf_counter *counter;
+
+ if (!is_software_counter(group)) {
+ if (n >= max_count)
+ return -1;
+ ctrs[n] = group;
+ flags[n] = group->hw.counter_base;
+ events[n++] = group->hw.config;
+ }
+ list_for_each_entry(counter, &group->sibling_list, list_entry) {
+ if (!is_software_counter(counter) &&
+ counter->state != PERF_COUNTER_STATE_OFF) {
+ if (n >= max_count)
+ return -1;
+ ctrs[n] = counter;
+ flags[n] = counter->hw.counter_base;
+ events[n++] = counter->hw.config;
+ }
+ }
+ return n;
+}
+
+static void counter_sched_in(struct perf_counter *counter, int cpu)
+{
+ counter->state = PERF_COUNTER_STATE_ACTIVE;
+ counter->oncpu = cpu;
+ counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
+ if (is_software_counter(counter))
+ counter->pmu->enable(counter);
+}
+
+/*
+ * Called to enable a whole group of counters.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_counter *group_leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx, int cpu)
+{
+ struct cpu_hw_counters *cpuhw;
+ long i, n, n0;
+ struct perf_counter *sub;
+
+ cpuhw = &__get_cpu_var(cpu_hw_counters);
+ n0 = cpuhw->n_counters;
+ n = collect_events(group_leader, ppmu->n_counter - n0,
+ &cpuhw->counter[n0], &cpuhw->events[n0],
+ &cpuhw->flags[n0]);
+ if (n < 0)
+ return -EAGAIN;
+ if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
+ return -EAGAIN;
+ i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+ if (i < 0)
+ return -EAGAIN;
+ cpuhw->n_counters = n0 + n;
+ cpuhw->n_added += n;
+
+ /*
+ * OK, this group can go on; update counter states etc.,
+ * and enable any software counters
+ */
+ for (i = n0; i < n0 + n; ++i)
+ cpuhw->counter[i]->hw.config = cpuhw->events[i];
+ cpuctx->active_oncpu += n;
+ n = 1;
+ counter_sched_in(group_leader, cpu);
+ list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
+ if (sub->state != PERF_COUNTER_STATE_OFF) {
+ counter_sched_in(sub, cpu);
+ ++n;
+ }
+ }
+ ctx->nr_active += n;
+
+ return 1;
+}
+
+/*
+ * Add a counter to the PMU.
+ * If all counters are not already frozen, then we disable and
+ * re-enable the PMU in order to get hw_perf_enable to do the
+ * actual work of reconfiguring the PMU.
+ */
+static int power_pmu_enable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuhw;
+ unsigned long flags;
+ int n0;
+ int ret = -EAGAIN;
+
+ local_irq_save(flags);
+ perf_disable();
+
+ /*
+ * Add the counter to the list (if there is room)
+ * and check whether the total set is still feasible.
+ */
+ cpuhw = &__get_cpu_var(cpu_hw_counters);
+ n0 = cpuhw->n_counters;
+ if (n0 >= ppmu->n_counter)
+ goto out;
+ cpuhw->counter[n0] = counter;
+ cpuhw->events[n0] = counter->hw.config;
+ cpuhw->flags[n0] = counter->hw.counter_base;
+ if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
+ goto out;
+ if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
+ goto out;
+
+ counter->hw.config = cpuhw->events[n0];
+ ++cpuhw->n_counters;
+ ++cpuhw->n_added;
+
+ ret = 0;
+ out:
+ perf_enable();
+ local_irq_restore(flags);
+ return ret;
+}
+
+/*
+ * Remove a counter from the PMU.
+ */
+static void power_pmu_disable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuhw;
+ long i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ perf_disable();
+
+ power_pmu_read(counter);
+
+ cpuhw = &__get_cpu_var(cpu_hw_counters);
+ for (i = 0; i < cpuhw->n_counters; ++i) {
+ if (counter == cpuhw->counter[i]) {
+ while (++i < cpuhw->n_counters)
+ cpuhw->counter[i-1] = cpuhw->counter[i];
+ --cpuhw->n_counters;
+ ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
+ if (counter->hw.idx) {
+ write_pmc(counter->hw.idx, 0);
+ counter->hw.idx = 0;
+ }
+ perf_counter_update_userpage(counter);
+ break;
+ }
+ }
+ for (i = 0; i < cpuhw->n_limited; ++i)
+ if (counter == cpuhw->limited_counter[i])
+ break;
+ if (i < cpuhw->n_limited) {
+ while (++i < cpuhw->n_limited) {
+ cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+ cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+ }
+ --cpuhw->n_limited;
+ }
+ if (cpuhw->n_counters == 0) {
+ /* disable exceptions if no counters are running */
+ cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+ }
+
+ perf_enable();
+ local_irq_restore(flags);
+}
+
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+ s64 val, left;
+ unsigned long flags;
+
+ if (!counter->hw.idx || !counter->hw.sample_period)
+ return;
+ local_irq_save(flags);
+ perf_disable();
+ power_pmu_read(counter);
+ left = counter->hw.sample_period;
+ counter->hw.last_period = left;
+ val = 0;
+ if (left < 0x80000000L)
+ val = 0x80000000L - left;
+ write_pmc(counter->hw.idx, val);
+ atomic64_set(&counter->hw.prev_count, val);
+ atomic64_set(&counter->hw.period_left, left);
+ perf_counter_update_userpage(counter);
+ perf_enable();
+ local_irq_restore(flags);
+}
+
+struct pmu power_pmu = {
+ .enable = power_pmu_enable,
+ .disable = power_pmu_disable,
+ .read = power_pmu_read,
+ .unthrottle = power_pmu_unthrottle,
+};
+
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
+ unsigned int flags)
+{
+ int n;
+ u64 alt[MAX_EVENT_ALTERNATIVES];
+
+ if (counter->attr.exclude_user
+ || counter->attr.exclude_kernel
+ || counter->attr.exclude_hv
+ || counter->attr.sample_period)
+ return 0;
+
+ if (ppmu->limited_pmc_event(ev))
+ return 1;
+
+ /*
+ * The requested event isn't on a limited PMC already;
+ * see if any alternative code goes on a limited PMC.
+ */
+ if (!ppmu->get_alternatives)
+ return 0;
+
+ flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+ n = ppmu->get_alternatives(ev, flags, alt);
+
+ return n > 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
+{
+ u64 alt[MAX_EVENT_ALTERNATIVES];
+ int n;
+
+ flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+ n = ppmu->get_alternatives(ev, flags, alt);
+ if (!n)
+ return 0;
+ return alt[0];
+}
+
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+ if (!atomic_add_unless(&num_counters, -1, 1)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_dec_return(&num_counters) == 0)
+ release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+/*
+ * Translate a generic cache event config to a raw event code.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+ unsigned long type, op, result;
+ int ev;
+
+ if (!ppmu->cache_events)
+ return -EINVAL;
+
+ /* unpack config */
+ type = config & 0xff;
+ op = (config >> 8) & 0xff;
+ result = (config >> 16) & 0xff;
+
+ if (type >= PERF_COUNT_HW_CACHE_MAX ||
+ op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+ result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return -EINVAL;
+
+ ev = (*ppmu->cache_events)[type][op][result];
+ if (ev == 0)
+ return -EOPNOTSUPP;
+ if (ev == -1)
+ return -EINVAL;
+ *eventp = ev;
+ return 0;
+}
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ u64 ev;
+ unsigned long flags;
+ struct perf_counter *ctrs[MAX_HWCOUNTERS];
+ u64 events[MAX_HWCOUNTERS];
+ unsigned int cflags[MAX_HWCOUNTERS];
+ int n;
+ int err;
+
+ if (!ppmu)
+ return ERR_PTR(-ENXIO);
+ switch (counter->attr.type) {
+ case PERF_TYPE_HARDWARE:
+ ev = counter->attr.config;
+ if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
+ return ERR_PTR(-EOPNOTSUPP);
+ ev = ppmu->generic_events[ev];
+ break;
+ case PERF_TYPE_HW_CACHE:
+ err = hw_perf_cache_event(counter->attr.config, &ev);
+ if (err)
+ return ERR_PTR(err);
+ break;
+ case PERF_TYPE_RAW:
+ ev = counter->attr.config;
+ break;
+ }
+ counter->hw.config_base = ev;
+ counter->hw.idx = 0;
+
+ /*
+ * If we are not running on a hypervisor, force the
+ * exclude_hv bit to 0 so that we don't care what
+ * the user set it to.
+ */
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ counter->attr.exclude_hv = 0;
+
+ /*
+ * If this is a per-task counter, then we can use
+ * PM_RUN_* events interchangeably with their non RUN_*
+ * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+ * XXX we should check if the task is an idle task.
+ */
+ flags = 0;
+ if (counter->ctx->task)
+ flags |= PPMU_ONLY_COUNT_RUN;
+
+ /*
+ * If this machine has limited counters, check whether this
+ * event could go on a limited counter.
+ */
+ if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
+ if (can_go_on_limited_pmc(counter, ev, flags)) {
+ flags |= PPMU_LIMITED_PMC_OK;
+ } else if (ppmu->limited_pmc_event(ev)) {
+ /*
+ * The requested event is on a limited PMC,
+ * but we can't use a limited PMC; see if any
+ * alternative goes on a normal PMC.
+ */
+ ev = normal_pmc_alternative(ev, flags);
+ if (!ev)
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ /*
+ * If this is in a group, check if it can go on with all the
+ * other hardware counters in the group. We assume the counter
+ * hasn't been linked into its leader's sibling list at this point.
+ */
+ n = 0;
+ if (counter->group_leader != counter) {
+ n = collect_events(counter->group_leader, ppmu->n_counter - 1,
+ ctrs, events, cflags);
+ if (n < 0)
+ return ERR_PTR(-EINVAL);
+ }
+ events[n] = ev;
+ ctrs[n] = counter;
+ cflags[n] = flags;
+ if (check_excludes(ctrs, cflags, n, 1))
+ return ERR_PTR(-EINVAL);
+ if (power_check_constraints(events, cflags, n + 1))
+ return ERR_PTR(-EINVAL);
+
+ counter->hw.config = events[n];
+ counter->hw.counter_base = cflags[n];
+ counter->hw.last_period = counter->hw.sample_period;
+ atomic64_set(&counter->hw.period_left, counter->hw.last_period);
+
+ /*
+ * See if we need to reserve the PMU.
+ * If no counters are currently in use, then we have to take a
+ * mutex to ensure that we don't race with another task doing
+ * reserve_pmc_hardware or release_pmc_hardware.
+ */
+ err = 0;
+ if (!atomic_inc_not_zero(&num_counters)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&num_counters) == 0 &&
+ reserve_pmc_hardware(perf_counter_interrupt))
+ err = -EBUSY;
+ else
+ atomic_inc(&num_counters);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+ counter->destroy = hw_perf_counter_destroy;
+
+ if (err)
+ return ERR_PTR(err);
+ return &power_pmu;
+}
+
+/*
+ * A counter has overflowed; update its count and record
+ * things if requested. Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ */
+static void record_and_restart(struct perf_counter *counter, long val,
+ struct pt_regs *regs, int nmi)
+{
+ u64 period = counter->hw.sample_period;
+ s64 prev, delta, left;
+ int record = 0;
+ u64 addr, mmcra, sdsync;
+
+ /* we don't have to worry about interrupts here */
+ prev = atomic64_read(&counter->hw.prev_count);
+ delta = (val - prev) & 0xfffffffful;
+ atomic64_add(delta, &counter->count);
+
+ /*
+ * See if the total period for this counter has expired,
+ * and update for the next period.
+ */
+ val = 0;
+ left = atomic64_read(&counter->hw.period_left) - delta;
+ if (period) {
+ if (left <= 0) {
+ left += period;
+ if (left <= 0)
+ left = period;
+ record = 1;
+ }
+ if (left < 0x80000000L)
+ val = 0x80000000L - left;
+ }
+
+ /*
+ * Finally record data if requested.
+ */
+ if (record) {
+ struct perf_sample_data data = {
+ .regs = regs,
+ .addr = 0,
+ .period = counter->hw.last_period,
+ };
+
+ if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
+ /*
+ * The user wants a data address recorded.
+ * If we're not doing instruction sampling,
+ * give them the SDAR (sampled data address).
+ * If we are doing instruction sampling, then only
+ * give them the SDAR if it corresponds to the
+ * instruction pointed to by SIAR; this is indicated
+ * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+ */
+ mmcra = regs->dsisr;
+ sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+ POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+ if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+ data.addr = mfspr(SPRN_SDAR);
+ }
+ if (perf_counter_overflow(counter, nmi, &data)) {
+ /*
+ * Interrupts are coming too fast - throttle them
+ * by setting the counter to 0, so it will be
+ * at least 2^30 cycles until the next interrupt
+ * (assuming each counter counts at most 2 counts
+ * per cycle).
+ */
+ val = 0;
+ left = ~0ULL >> 1;
+ }
+ }
+
+ write_pmc(counter->hw.idx, val);
+ atomic64_set(&counter->hw.prev_count, val);
+ atomic64_set(&counter->hw.period_left, left);
+ perf_counter_update_userpage(counter);
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+ unsigned long mmcra;
+
+ if (TRAP(regs) != 0xf00) {
+ /* not a PMU interrupt */
+ return user_mode(regs) ? PERF_EVENT_MISC_USER :
+ PERF_EVENT_MISC_KERNEL;
+ }
+
+ mmcra = regs->dsisr;
+ if (ppmu->flags & PPMU_ALT_SIPR) {
+ if (mmcra & POWER6_MMCRA_SIHV)
+ return PERF_EVENT_MISC_HYPERVISOR;
+ return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+ PERF_EVENT_MISC_KERNEL;
+ }
+ if (mmcra & MMCRA_SIHV)
+ return PERF_EVENT_MISC_HYPERVISOR;
+ return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+ PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+ unsigned long mmcra;
+ unsigned long ip;
+ unsigned long slot;
+
+ if (TRAP(regs) != 0xf00)
+ return regs->nip; /* not a PMU interrupt */
+
+ ip = mfspr(SPRN_SIAR);
+ mmcra = regs->dsisr;
+ if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+ slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+ if (slot > 1)
+ ip += 4 * (slot - 1);
+ }
+ return ip;
+}
+
+/*
+ * Performance monitor interrupt stuff
+ */
+static void perf_counter_interrupt(struct pt_regs *regs)
+{
+ int i;
+ struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+ struct perf_counter *counter;
+ long val;
+ int found = 0;
+ int nmi;
+
+ if (cpuhw->n_limited)
+ freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+ mfspr(SPRN_PMC6));
+
+ /*
+ * Overload regs->dsisr to store MMCRA so we only need to read it once.
+ */
+ regs->dsisr = mfspr(SPRN_MMCRA);
+
+ /*
+ * If interrupts were soft-disabled when this PMU interrupt
+ * occurred, treat it as an NMI.
+ */
+ nmi = !regs->softe;
+ if (nmi)
+ nmi_enter();
+ else
+ irq_enter();
+
+ for (i = 0; i < cpuhw->n_counters; ++i) {
+ counter = cpuhw->counter[i];
+ if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
+ continue;
+ val = read_pmc(counter->hw.idx);
+ if ((int)val < 0) {
+ /* counter has overflowed */
+ found = 1;
+ record_and_restart(counter, val, regs, nmi);
+ }
+ }
+
+ /*
+ * In case we didn't find and reset the counter that caused
+ * the interrupt, scan all counters and reset any that are
+ * negative, to avoid getting continual interrupts.
+ * Any that we processed in the previous loop will not be negative.
+ */
+ if (!found) {
+ for (i = 0; i < ppmu->n_counter; ++i) {
+ if (is_limited_pmc(i + 1))
+ continue;
+ val = read_pmc(i + 1);
+ if ((int)val < 0)
+ write_pmc(i + 1, 0);
+ }
+ }
+
+ /*
+ * Reset MMCR0 to its normal value. This will set PMXE and
+ * clear FC (freeze counters) and PMAO (perf mon alert occurred)
+ * and thus allow interrupts to occur again.
+ * XXX might want to use MSR.PM to keep the counters frozen until
+ * we get back out of this interrupt.
+ */
+ write_mmcr0(cpuhw, cpuhw->mmcr[0]);
+
+ if (nmi)
+ nmi_exit();
+ else
+ irq_exit();
+}
+
+void hw_perf_counter_setup(int cpu)
+{
+ struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
+
+ memset(cpuhw, 0, sizeof(*cpuhw));
+ cpuhw->mmcr[0] = MMCR0_FC;
+}
+
+extern struct power_pmu power4_pmu;
+extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power5_pmu;
+extern struct power_pmu power5p_pmu;
+extern struct power_pmu power6_pmu;
+extern struct power_pmu power7_pmu;
+
+static int init_perf_counters(void)
+{
+ unsigned long pvr;
+
+ /* XXX should get this from cputable */
+ pvr = mfspr(SPRN_PVR);
+ switch (PVR_VER(pvr)) {
+ case PV_POWER4:
+ case PV_POWER4p:
+ ppmu = &power4_pmu;
+ break;
+ case PV_970:
+ case PV_970FX:
+ case PV_970MP:
+ ppmu = &ppc970_pmu;
+ break;
+ case PV_POWER5:
+ ppmu = &power5_pmu;
+ break;
+ case PV_POWER5p:
+ ppmu = &power5p_pmu;
+ break;
+ case 0x3e:
+ ppmu = &power6_pmu;
+ break;
+ case 0x3f:
+ ppmu = &power7_pmu;
+ break;
+ }
+
+ /*
+ * Use FCHV to ignore kernel events if MSR.HV is set.
+ */
+ if (mfmsr() & MSR_HV)
+ freeze_counters_kernel = MMCR0_FCHV;
+
+ return 0;
+}
+
+arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 00000000000..07bd308a5fa
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,598 @@
+/*
+ * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER4
+ */
+#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK 0xf
+#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK 0xf
+#define PM_LOWER_SH 6
+#define PM_LOWER_MSK 1
+#define PM_LOWER_MSKS 0x40
+#define PM_BYTE_SH 4 /* Byte number of event bus to use */
+#define PM_BYTE_MSK 3
+#define PM_PMCSEL_MSK 7
+
+/*
+ * Unit code values
+ */
+#define PM_FPU 1
+#define PM_ISU1 2
+#define PM_IFU 3
+#define PM_IDU0 4
+#define PM_ISU1_ALT 6
+#define PM_ISU2 7
+#define PM_IFU_ALT 8
+#define PM_LSU0 9
+#define PM_LSU1 0xc
+#define PM_GPS 0xf
+
+/*
+ * Bits in MMCR0 for POWER4
+ */
+#define MMCR0_PMC1SEL_SH 8
+#define MMCR0_PMC2SEL_SH 1
+#define MMCR_PMCSEL_MSK 0x1f
+
+/*
+ * Bits in MMCR1 for POWER4
+ */
+#define MMCR1_TTM0SEL_SH 62
+#define MMCR1_TTC0SEL_SH 61
+#define MMCR1_TTM1SEL_SH 59
+#define MMCR1_TTC1SEL_SH 58
+#define MMCR1_TTM2SEL_SH 56
+#define MMCR1_TTC2SEL_SH 55
+#define MMCR1_TTM3SEL_SH 53
+#define MMCR1_TTC3SEL_SH 52
+#define MMCR1_TTMSEL_MSK 3
+#define MMCR1_TD_CP_DBG0SEL_SH 50
+#define MMCR1_TD_CP_DBG1SEL_SH 48
+#define MMCR1_TD_CP_DBG2SEL_SH 46
+#define MMCR1_TD_CP_DBG3SEL_SH 44
+#define MMCR1_DEBUG0SEL_SH 43
+#define MMCR1_DEBUG1SEL_SH 42
+#define MMCR1_DEBUG2SEL_SH 41
+#define MMCR1_DEBUG3SEL_SH 40
+#define MMCR1_PMC1_ADDER_SEL_SH 39
+#define MMCR1_PMC2_ADDER_SEL_SH 38
+#define MMCR1_PMC6_ADDER_SEL_SH 37
+#define MMCR1_PMC5_ADDER_SEL_SH 36
+#define MMCR1_PMC8_ADDER_SEL_SH 35
+#define MMCR1_PMC7_ADDER_SEL_SH 34
+#define MMCR1_PMC3_ADDER_SEL_SH 33
+#define MMCR1_PMC4_ADDER_SEL_SH 32
+#define MMCR1_PMC3SEL_SH 27
+#define MMCR1_PMC4SEL_SH 22
+#define MMCR1_PMC5SEL_SH 17
+#define MMCR1_PMC6SEL_SH 12
+#define MMCR1_PMC7SEL_SH 7
+#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
+
+static short mmcr1_adder_bits[8] = {
+ MMCR1_PMC1_ADDER_SEL_SH,
+ MMCR1_PMC2_ADDER_SEL_SH,
+ MMCR1_PMC3_ADDER_SEL_SH,
+ MMCR1_PMC4_ADDER_SEL_SH,
+ MMCR1_PMC5_ADDER_SEL_SH,
+ MMCR1_PMC6_ADDER_SEL_SH,
+ MMCR1_PMC7_ADDER_SEL_SH,
+ MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><>
+ * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
+ * \SMPL ||\TTC3SEL
+ * |\TTC_IFU_SEL
+ * \TTM2SEL0
+ *
+ * SMPL - SAMPLE_ENABLE constraint
+ * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
+ *
+ * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
+ * 55: UC1 error 0x0080_0000_0000_0000
+ * 54: FPU events needed 0x0040_0000_0000_0000
+ * 53: ISU1 events needed 0x0020_0000_0000_0000
+ * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
+ *
+ * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
+ * 51: UC2 error 0x0008_0000_0000_0000
+ * 50: FPU events needed 0x0004_0000_0000_0000
+ * 49: IFU events needed 0x0002_0000_0000_0000
+ * 48: LSU0 events needed 0x0001_0000_0000_0000
+ *
+ * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
+ * 47: UC3 error 0x8000_0000_0000
+ * 46: LSU0 events needed 0x4000_0000_0000
+ * 45: IFU events needed 0x2000_0000_0000
+ * 44: IDU0|ISU2 events needed 0x1000_0000_0000
+ * 43: ISU1 events needed 0x0800_0000_0000
+ *
+ * TTM2SEL0
+ * 42: 0 = IDU0 events needed
+ * 1 = ISU2 events needed 0x0400_0000_0000
+ *
+ * TTC_IFU_SEL
+ * 41: 0 = IFU.U events needed
+ * 1 = IFU.L events needed 0x0200_0000_0000
+ *
+ * TTC3SEL
+ * 40: 0 = LSU1.U events needed
+ * 1 = LSU1.L events needed 0x0100_0000_0000
+ *
+ * PS1
+ * 39: PS1 error 0x0080_0000_0000
+ * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ * 35: PS2 error 0x0008_0000_0000
+ * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ * 28-31: Byte 0 event source 0xf000_0000
+ * 1 = FPU
+ * 2 = ISU1
+ * 3 = IFU
+ * 4 = IDU0
+ * 7 = ISU2
+ * 9 = LSU0
+ * c = LSU1
+ * f = GPS
+ *
+ * B1, B2, B3
+ * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P8
+ * 15: P8 error 0x8000
+ * 14-15: Count of events needing PMC8
+ *
+ * P1..P7
+ * 0-13: Count of events needing PMC1..PMC7
+ *
+ * Note: this doesn't allow events using IFU.U to be combined with events
+ * using IFU.L, though that is feasible (using TTM0 and TTM2). However
+ * there are no listed events for IFU.L (they are debug events not
+ * verified for performance monitoring) so this shouldn't cause a
+ * problem.
+ */
+
+static struct unitinfo {
+ u64 value, mask;
+ int unit;
+ int lowerbit;
+} p4_unitinfo[16] = {
+ [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
+ [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+ [PM_ISU1_ALT] =
+ { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+ [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+ [PM_IFU_ALT] =
+ { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+ [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
+ [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
+ [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
+ [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
+ [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
+};
+
+static unsigned char direct_marked_event[8] = {
+ (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+ (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+ (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */
+ (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+ (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */
+ (1<<3) | (1<<4) | (1<<5),
+ /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+ (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+ (1<<4), /* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p4_marked_instr_event(u64 event)
+{
+ int pmc, psel, unit, byte, bit;
+ unsigned int mask;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ psel = event & PM_PMCSEL_MSK;
+ if (pmc) {
+ if (direct_marked_event[pmc - 1] & (1 << psel))
+ return 1;
+ if (psel == 0) /* add events */
+ bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+ else if (psel == 6) /* decode events */
+ bit = 4;
+ else
+ return 0;
+ } else
+ bit = psel;
+
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ mask = 0;
+ switch (unit) {
+ case PM_LSU1:
+ if (event & PM_LOWER_MSKS)
+ mask = 1 << 28; /* byte 7 bit 4 */
+ else
+ mask = 6 << 24; /* byte 3 bits 1 and 2 */
+ break;
+ case PM_LSU0:
+ /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
+ mask = 0x083dff00;
+ }
+ return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+ int pmc, byte, unit, lower, sh;
+ u64 mask = 0, value = 0;
+ int grp = -1;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 8)
+ return -1;
+ sh = (pmc - 1) * 2;
+ mask |= 2 << sh;
+ value |= 1 << sh;
+ grp = ((pmc - 1) >> 1) & 1;
+ }
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ if (unit) {
+ lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
+
+ /*
+ * Bus events on bytes 0 and 2 can be counted
+ * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+ */
+ if (!pmc)
+ grp = byte & 1;
+
+ if (!p4_unitinfo[unit].unit)
+ return -1;
+ mask |= p4_unitinfo[unit].mask;
+ value |= p4_unitinfo[unit].value;
+ sh = p4_unitinfo[unit].lowerbit;
+ if (sh > 1)
+ value |= (u64)lower << sh;
+ else if (lower != sh)
+ return -1;
+ unit = p4_unitinfo[unit].unit;
+
+ /* Set byte lane select field */
+ mask |= 0xfULL << (28 - 4 * byte);
+ value |= (u64)unit << (28 - 4 * byte);
+ }
+ if (grp == 0) {
+ /* increment PMC1/2/5/6 field */
+ mask |= 0x8000000000ull;
+ value |= 0x1000000000ull;
+ } else {
+ /* increment PMC3/4/7/8 field */
+ mask |= 0x800000000ull;
+ value |= 0x100000000ull;
+ }
+
+ /* Marked instruction events need sample_enable set */
+ if (p4_marked_instr_event(event)) {
+ mask |= 1ull << 56;
+ value |= 1ull << 56;
+ }
+
+ /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
+ if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
+ mask |= 1ull << 56;
+
+ *maskp = mask;
+ *valp = value;
+ return 0;
+}
+
+static unsigned int ppc_inst_cmpl[] = {
+ 0x1001, 0x4001, 0x6001, 0x7001, 0x8001
+};
+
+static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+ int i, j, na;
+
+ alt[0] = event;
+ na = 1;
+
+ /* 2 possibilities for PM_GRP_DISP_REJECT */
+ if (event == 0x8003 || event == 0x0224) {
+ alt[1] = event ^ (0x8003 ^ 0x0224);
+ return 2;
+ }
+
+ /* 2 possibilities for PM_ST_MISS_L1 */
+ if (event == 0x0c13 || event == 0x0c23) {
+ alt[1] = event ^ (0x0c13 ^ 0x0c23);
+ return 2;
+ }
+
+ /* several possibilities for PM_INST_CMPL */
+ for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
+ if (event == ppc_inst_cmpl[i]) {
+ for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
+ if (j != i)
+ alt[na++] = ppc_inst_cmpl[j];
+ break;
+ }
+ }
+
+ return na;
+}
+
+static int p4_compute_mmcr(u64 event[], int n_ev,
+ unsigned int hwc[], u64 mmcr[])
+{
+ u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+ unsigned int pmc, unit, byte, psel, lower;
+ unsigned int ttm, grp;
+ unsigned int pmc_inuse = 0;
+ unsigned int pmc_grp_use[2];
+ unsigned char busbyte[4];
+ unsigned char unituse[16];
+ unsigned int unitlower = 0;
+ int i;
+
+ if (n_ev > 8)
+ return -1;
+
+ /* First pass to count resource use */
+ pmc_grp_use[0] = pmc_grp_use[1] = 0;
+ memset(busbyte, 0, sizeof(busbyte));
+ memset(unituse, 0, sizeof(unituse));
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc_inuse & (1 << (pmc - 1)))
+ return -1;
+ pmc_inuse |= 1 << (pmc - 1);
+ /* count 1/2/5/6 vs 3/4/7/8 use */
+ ++pmc_grp_use[((pmc - 1) >> 1) & 1];
+ }
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
+ if (unit) {
+ if (!pmc)
+ ++pmc_grp_use[byte & 1];
+ if (unit == 6 || unit == 8)
+ /* map alt ISU1/IFU codes: 6->2, 8->3 */
+ unit = (unit >> 1) - 1;
+ if (busbyte[byte] && busbyte[byte] != unit)
+ return -1;
+ busbyte[byte] = unit;
+ lower <<= unit;
+ if (unituse[unit] && lower != (unitlower & lower))
+ return -1;
+ unituse[unit] = 1;
+ unitlower |= lower;
+ }
+ }
+ if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+ return -1;
+
+ /*
+ * Assign resources and set multiplexer selects.
+ *
+ * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
+ * Each TTMx can only select one unit, but since
+ * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
+ * we have some choices.
+ */
+ if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
+ unituse[6] = 1; /* Move 2 to 6 */
+ unituse[2] = 0;
+ }
+ if (unituse[3] & (unituse[1] | unituse[2])) {
+ unituse[8] = 1; /* Move 3 to 8 */
+ unituse[3] = 0;
+ unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
+ }
+ /* Check only one unit per TTMx */
+ if (unituse[1] + unituse[2] + unituse[3] > 1 ||
+ unituse[4] + unituse[6] + unituse[7] > 1 ||
+ unituse[8] + unituse[9] > 1 ||
+ (unituse[5] | unituse[10] | unituse[11] |
+ unituse[13] | unituse[14]))
+ return -1;
+
+ /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
+ mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
+ mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
+ mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
+
+ /* Set TTCxSEL fields. */
+ if (unitlower & 0xe)
+ mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
+ if (unitlower & 0xf0)
+ mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
+ if (unitlower & 0xf00)
+ mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
+ if (unitlower & 0x7000)
+ mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
+
+ /* Set byte lane select fields. */
+ for (byte = 0; byte < 4; ++byte) {
+ unit = busbyte[byte];
+ if (!unit)
+ continue;
+ if (unit == 0xf) {
+ /* special case for GPS */
+ mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
+ } else {
+ if (!unituse[unit])
+ ttm = unit - 1; /* 2->1, 3->2 */
+ else
+ ttm = unit >> 2;
+ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
+ }
+ }
+
+ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ psel = event[i] & PM_PMCSEL_MSK;
+ if (!pmc) {
+ /* Bus event or 00xxx direct event (off or cycles) */
+ if (unit)
+ psel |= 0x10 | ((byte & 2) << 2);
+ for (pmc = 0; pmc < 8; ++pmc) {
+ if (pmc_inuse & (1 << pmc))
+ continue;
+ grp = (pmc >> 1) & 1;
+ if (unit) {
+ if (grp == (byte & 1))
+ break;
+ } else if (pmc_grp_use[grp] < 4) {
+ ++pmc_grp_use[grp];
+ break;
+ }
+ }
+ pmc_inuse |= 1 << pmc;
+ } else {
+ /* Direct event */
+ --pmc;
+ if (psel == 0 && (byte & 2))
+ /* add events on higher-numbered bus */
+ mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+ else if (psel == 6 && byte == 3)
+ /* seem to need to set sample_enable here */
+ mmcra |= MMCRA_SAMPLE_ENABLE;
+ psel |= 8;
+ }
+ if (pmc <= 1)
+ mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
+ else
+ mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+ if (pmc == 7) /* PMC8 */
+ mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
+ hwc[i] = pmc;
+ if (p4_marked_instr_event(event[i]))
+ mmcra |= MMCRA_SAMPLE_ENABLE;
+ }
+
+ if (pmc_inuse & 1)
+ mmcr0 |= MMCR0_PMC1CE;
+ if (pmc_inuse & 0xfe)
+ mmcr0 |= MMCR0_PMCjCE;
+
+ mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
+
+ /* Return MMCRx values */
+ mmcr[0] = mmcr0;
+ mmcr[1] = mmcr1;
+ mmcr[2] = mmcra;
+ return 0;
+}
+
+static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+ /*
+ * Setting the PMCxSEL field to 0 disables PMC x.
+ * (Note that pmc is 0-based here, not 1-based.)
+ */
+ if (pmc <= 1) {
+ mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
+ } else {
+ mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
+ if (pmc == 7)
+ mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
+ }
+}
+
+static int p4_generic_events[] = {
+ [PERF_COUNT_HW_CPU_CYCLES] = 7,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x1001,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
+};
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+ [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x8c10, 0x3c10 },
+ [C(OP_WRITE)] = { 0x7c10, 0xc13 },
+ [C(OP_PREFETCH)] = { 0xc35, 0 },
+ },
+ [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { 0, 0 },
+ },
+ [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0 },
+ [C(OP_WRITE)] = { 0, 0 },
+ [C(OP_PREFETCH)] = { 0xc34, 0 },
+ },
+ [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x904 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x900 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x330, 0x331 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+};
+
+struct power_pmu power4_pmu = {
+ .n_counter = 8,
+ .max_alternatives = 5,
+ .add_fields = 0x0000001100005555ull,
+ .test_adder = 0x0011083300000000ull,
+ .compute_mmcr = p4_compute_mmcr,
+ .get_constraint = p4_get_constraint,
+ .get_alternatives = p4_get_alternatives,
+ .disable_pmc = p4_disable_pmc,
+ .n_generic = ARRAY_SIZE(p4_generic_events),
+ .generic_events = p4_generic_events,
+ .cache_events = &power4_cache_events,
+};
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 00000000000..41e5d2d958d
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,671 @@
+/*
+ * Performance counter support for POWER5+/++ (not POWER5) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
+ */
+#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK 0xf
+#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK 0xf
+#define PM_BYTE_SH 12 /* Byte number of event bus to use */
+#define PM_BYTE_MSK 7
+#define PM_GRS_SH 8 /* Storage subsystem mux select */
+#define PM_GRS_MSK 7
+#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
+#define PM_PMCSEL_MSK 0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU 0
+#define PM_ISU0 1
+#define PM_IFU 2
+#define PM_ISU1 3
+#define PM_IDU 4
+#define PM_ISU0_ALT 6
+#define PM_GRS 7
+#define PM_LSU0 8
+#define PM_LSU1 0xc
+#define PM_LASTUNIT 0xc
+
+/*
+ * Bits in MMCR1 for POWER5+
+ */
+#define MMCR1_TTM0SEL_SH 62
+#define MMCR1_TTM1SEL_SH 60
+#define MMCR1_TTM2SEL_SH 58
+#define MMCR1_TTM3SEL_SH 56
+#define MMCR1_TTMSEL_MSK 3
+#define MMCR1_TD_CP_DBG0SEL_SH 54
+#define MMCR1_TD_CP_DBG1SEL_SH 52
+#define MMCR1_TD_CP_DBG2SEL_SH 50
+#define MMCR1_TD_CP_DBG3SEL_SH 48
+#define MMCR1_GRS_L2SEL_SH 46
+#define MMCR1_GRS_L2SEL_MSK 3
+#define MMCR1_GRS_L3SEL_SH 44
+#define MMCR1_GRS_L3SEL_MSK 3
+#define MMCR1_GRS_MCSEL_SH 41
+#define MMCR1_GRS_MCSEL_MSK 7
+#define MMCR1_GRS_FABSEL_SH 39
+#define MMCR1_GRS_FABSEL_MSK 3
+#define MMCR1_PMC1_ADDER_SEL_SH 35
+#define MMCR1_PMC2_ADDER_SEL_SH 34
+#define MMCR1_PMC3_ADDER_SEL_SH 33
+#define MMCR1_PMC4_ADDER_SEL_SH 32
+#define MMCR1_PMC1SEL_SH 25
+#define MMCR1_PMC2SEL_SH 17
+#define MMCR1_PMC3SEL_SH 9
+#define MMCR1_PMC4SEL_SH 1
+#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK 0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><>
+ * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ * 51: NC error 0x0008_0000_0000_0000
+ * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ * 46-47: GRS_L2SEL value
+ * 44-45: GRS_L3SEL value
+ * 41-44: GRS_MCSEL value
+ * 39-40: GRS_FABSEL value
+ * Note that these match up with their bit positions in MMCR1
+ *
+ * T0 - TTM0 constraint
+ * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
+ *
+ * T1 - TTM1 constraint
+ * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ * 33: UC3 error 0x02_0000_0000
+ * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000
+ * 31: ISU0 events needed 0x01_8000_0000
+ * 30: IDU|GRS events needed 0x00_4000_0000
+ *
+ * B0
+ * 24-27: Byte 0 event source 0x0f00_0000
+ * Encoding as for the event code
+ *
+ * B1, B2, B3
+ * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P6
+ * 11: P6 error 0x800
+ * 10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ * 0-9: Count of events needing PMC1..PMC5
+ */
+
+static const int grsel_shift[8] = {
+ MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+ MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+ MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+ [PM_FPU] = { 0x3200000000ull, 0x0100000000ull },
+ [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull },
+ [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull },
+ [PM_IFU] = { 0x3200000000ull, 0x2100000000ull },
+ [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull },
+ [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull },
+};
+
+static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+ int pmc, byte, unit, sh;
+ int bit, fmask;
+ u64 mask = 0, value = 0;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 6)
+ return -1;
+ sh = (pmc - 1) * 2;
+ mask |= 2 << sh;
+ value |= 1 << sh;
+ if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
+ return -1;
+ }
+ if (event & PM_BUSEVENT_MSK) {
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ if (unit > PM_LASTUNIT)
+ return -1;
+ if (unit == PM_ISU0_ALT)
+ unit = PM_ISU0;
+ mask |= unit_cons[unit][0];
+ value |= unit_cons[unit][1];
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ if (byte >= 4) {
+ if (unit != PM_LSU1)
+ return -1;
+ /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+ ++unit;
+ byte &= 3;
+ }
+ if (unit == PM_GRS) {
+ bit = event & 7;
+ fmask = (bit == 6)? 7: 3;
+ sh = grsel_shift[bit];
+ mask |= (u64)fmask << sh;
+ value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+ }
+ /* Set byte lane select field */
+ mask |= 0xfULL << (24 - 4 * byte);
+ value |= (u64)unit << (24 - 4 * byte);
+ }
+ if (pmc < 5) {
+ /* need a counter from PMC1-4 set */
+ mask |= 0x8000000000000ull;
+ value |= 0x1000000000000ull;
+ }
+ *maskp = mask;
+ *valp = value;
+ return 0;
+}
+
+static int power5p_limited_pmc_event(u64 event)
+{
+ int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+ return pmc == 5 || pmc == 6;
+}
+
+#define MAX_ALT 3 /* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+ { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */
+ { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
+ { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */
+ { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */
+ { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
+ { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */
+ { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */
+ { 0x100005, 0x600005 }, /* PM_RUN_CYC */
+ { 0x100009, 0x200009 }, /* PM_INST_CMPL */
+ { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */
+ { 0x300009, 0x400009 }, /* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+ if (event < event_alternatives[i][0])
+ break;
+ for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+ if (event == event_alternatives[i][j])
+ return i;
+ }
+ return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+ /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
+ /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
+ /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
+ /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters. This returns the alternative
+ * event code for those that do, or -1 otherwise. This also handles
+ * alternative PCMSEL values for add events.
+ */
+static s64 find_alternative_bdecode(u64 event)
+{
+ int pmc, altpmc, pp, j;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc == 0 || pmc > 4)
+ return -1;
+ altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
+ pp = event & PM_PMCSEL_MSK;
+ for (j = 0; j < 4; ++j) {
+ if (bytedecode_alternatives[pmc - 1][j] == pp) {
+ return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+ (altpmc << PM_PMC_SH) |
+ bytedecode_alternatives[altpmc - 1][j];
+ }
+ }
+
+ /* new decode alternatives for power5+ */
+ if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
+ return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
+ if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
+ return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
+
+ /* alternative add event encodings */
+ if (pp == 0x10 || pp == 0x28)
+ return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
+ (altpmc << PM_PMC_SH);
+
+ return -1;
+}
+
+static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+ int i, j, nalt = 1;
+ int nlim;
+ s64 ae;
+
+ alt[0] = event;
+ nalt = 1;
+ nlim = power5p_limited_pmc_event(event);
+ i = find_alternative(event);
+ if (i >= 0) {
+ for (j = 0; j < MAX_ALT; ++j) {
+ ae = event_alternatives[i][j];
+ if (ae && ae != event)
+ alt[nalt++] = ae;
+ nlim += power5p_limited_pmc_event(ae);
+ }
+ } else {
+ ae = find_alternative_bdecode(event);
+ if (ae > 0)
+ alt[nalt++] = ae;
+ }
+
+ if (flags & PPMU_ONLY_COUNT_RUN) {
+ /*
+ * We're only counting in RUN state,
+ * so PM_CYC is equivalent to PM_RUN_CYC
+ * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+ * This doesn't include alternatives that don't provide
+ * any extra flexibility in assigning PMCs (e.g.
+ * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
+ * Note that even with these additional alternatives
+ * we never end up with more than 3 alternatives for any event.
+ */
+ j = nalt;
+ for (i = 0; i < nalt; ++i) {
+ switch (alt[i]) {
+ case 0xf: /* PM_CYC */
+ alt[j++] = 0x600005; /* PM_RUN_CYC */
+ ++nlim;
+ break;
+ case 0x600005: /* PM_RUN_CYC */
+ alt[j++] = 0xf;
+ break;
+ case 0x100009: /* PM_INST_CMPL */
+ alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
+ ++nlim;
+ break;
+ case 0x500009: /* PM_RUN_INST_CMPL */
+ alt[j++] = 0x100009; /* PM_INST_CMPL */
+ alt[j++] = 0x200009;
+ break;
+ }
+ }
+ nalt = j;
+ }
+
+ if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+ /* remove the limited PMC events */
+ j = 0;
+ for (i = 0; i < nalt; ++i) {
+ if (!power5p_limited_pmc_event(alt[i])) {
+ alt[j] = alt[i];
+ ++j;
+ }
+ }
+ nalt = j;
+ } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+ /* remove all but the limited PMC events */
+ j = 0;
+ for (i = 0; i < nalt; ++i) {
+ if (power5p_limited_pmc_event(alt[i])) {
+ alt[j] = alt[i];
+ ++j;
+ }
+ }
+ nalt = j;
+ }
+
+ return nalt;
+}
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+ 0, /* 00 */
+ 0x1f, /* 01 PM_IOPS_CMPL */
+ 0x2, /* 02 PM_MRK_GRP_DISP */
+ 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+ 0, /* 04 */
+ 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+ 0x80, /* 06 */
+ 0x80, /* 07 */
+ 0, 0, 0,/* 08 - 0a */
+ 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+ 0, /* 0c */
+ 0x80, /* 0d */
+ 0x80, /* 0e */
+ 0, /* 0f */
+ 0, /* 10 */
+ 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+ 0, /* 12 */
+ 0x10, /* 13 PM_MRK_GRP_CMPL */
+ 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+ 0x2, /* 15 PM_MRK_GRP_ISSUED */
+ 0x80, /* 16 */
+ 0x80, /* 17 */
+ 0, 0, 0, 0, 0,
+ 0x80, /* 1d */
+ 0x80, /* 1e */
+ 0, /* 1f */
+ 0x80, /* 20 */
+ 0x80, /* 21 */
+ 0x80, /* 22 */
+ 0x80, /* 23 */
+ 0x80, /* 24 */
+ 0x80, /* 25 */
+ 0x80, /* 26 */
+ 0x80, /* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5p_marked_instr_event(u64 event)
+{
+ int pmc, psel;
+ int bit, byte, unit;
+ u32 mask;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ psel = event & PM_PMCSEL_MSK;
+ if (pmc >= 5)
+ return 0;
+
+ bit = -1;
+ if (psel < sizeof(direct_event_is_marked)) {
+ if (direct_event_is_marked[psel] & (1 << pmc))
+ return 1;
+ if (direct_event_is_marked[psel] & 0x80)
+ bit = 4;
+ else if (psel == 0x08)
+ bit = pmc - 1;
+ else if (psel == 0x10)
+ bit = 4 - pmc;
+ else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+ bit = 4;
+ } else if ((psel & 0x48) == 0x40) {
+ bit = psel & 7;
+ } else if (psel == 0x28) {
+ bit = pmc - 1;
+ } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
+ bit = 4;
+ }
+
+ if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+ return 0;
+
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ if (unit == PM_LSU0) {
+ /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+ mask = 0x5dff00;
+ } else if (unit == PM_LSU1 && byte >= 4) {
+ byte -= 4;
+ /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
+ mask = 0x5f11c000;
+ } else
+ return 0;
+
+ return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int power5p_compute_mmcr(u64 event[], int n_ev,
+ unsigned int hwc[], u64 mmcr[])
+{
+ u64 mmcr1 = 0;
+ u64 mmcra = 0;
+ unsigned int pmc, unit, byte, psel;
+ unsigned int ttm;
+ int i, isbus, bit, grsel;
+ unsigned int pmc_inuse = 0;
+ unsigned char busbyte[4];
+ unsigned char unituse[16];
+ int ttmuse;
+
+ if (n_ev > 6)
+ return -1;
+
+ /* First pass to count resource use */
+ memset(busbyte, 0, sizeof(busbyte));
+ memset(unituse, 0, sizeof(unituse));
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 6)
+ return -1;
+ if (pmc_inuse & (1 << (pmc - 1)))
+ return -1;
+ pmc_inuse |= 1 << (pmc - 1);
+ }
+ if (event[i] & PM_BUSEVENT_MSK) {
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ if (unit > PM_LASTUNIT)
+ return -1;
+ if (unit == PM_ISU0_ALT)
+ unit = PM_ISU0;
+ if (byte >= 4) {
+ if (unit != PM_LSU1)
+ return -1;
+ ++unit;
+ byte &= 3;
+ }
+ if (busbyte[byte] && busbyte[byte] != unit)
+ return -1;
+ busbyte[byte] = unit;
+ unituse[unit] = 1;
+ }
+ }
+
+ /*
+ * Assign resources and set multiplexer selects.
+ *
+ * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+ * choice we have to deal with.
+ */
+ if (unituse[PM_ISU0] &
+ (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+ unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
+ unituse[PM_ISU0] = 0;
+ }
+ /* Set TTM[01]SEL fields. */
+ ttmuse = 0;
+ for (i = PM_FPU; i <= PM_ISU1; ++i) {
+ if (!unituse[i])
+ continue;
+ if (ttmuse++)
+ return -1;
+ mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+ }
+ ttmuse = 0;
+ for (; i <= PM_GRS; ++i) {
+ if (!unituse[i])
+ continue;
+ if (ttmuse++)
+ return -1;
+ mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+ }
+ if (ttmuse > 1)
+ return -1;
+
+ /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+ for (byte = 0; byte < 4; ++byte) {
+ unit = busbyte[byte];
+ if (!unit)
+ continue;
+ if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+ /* get ISU0 through TTM1 rather than TTM0 */
+ unit = PM_ISU0_ALT;
+ } else if (unit == PM_LSU1 + 1) {
+ /* select lower word of LSU1 for this byte */
+ mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+ }
+ ttm = unit >> 2;
+ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+ }
+
+ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ psel = event[i] & PM_PMCSEL_MSK;
+ isbus = event[i] & PM_BUSEVENT_MSK;
+ if (!pmc) {
+ /* Bus event or any-PMC direct event */
+ for (pmc = 0; pmc < 4; ++pmc) {
+ if (!(pmc_inuse & (1 << pmc)))
+ break;
+ }
+ if (pmc >= 4)
+ return -1;
+ pmc_inuse |= 1 << pmc;
+ } else if (pmc <= 4) {
+ /* Direct event */
+ --pmc;
+ if (isbus && (byte & 2) &&
+ (psel == 8 || psel == 0x10 || psel == 0x28))
+ /* add events on higher-numbered bus */
+ mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+ } else {
+ /* Instructions or run cycles on PMC5/6 */
+ --pmc;
+ }
+ if (isbus && unit == PM_GRS) {
+ bit = psel & 7;
+ grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+ mmcr1 |= (u64)grsel << grsel_shift[bit];
+ }
+ if (power5p_marked_instr_event(event[i]))
+ mmcra |= MMCRA_SAMPLE_ENABLE;
+ if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
+ /* select alternate byte lane */
+ psel |= 0x10;
+ if (pmc <= 3)
+ mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+ hwc[i] = pmc;
+ }
+
+ /* Return MMCRx values */
+ mmcr[0] = 0;
+ if (pmc_inuse & 1)
+ mmcr[0] = MMCR0_PMC1CE;
+ if (pmc_inuse & 0x3e)
+ mmcr[0] |= MMCR0_PMCjCE;
+ mmcr[1] = mmcr1;
+ mmcr[2] = mmcra;
+ return 0;
+}
+
+static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+ if (pmc <= 3)
+ mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5p_generic_events[] = {
+ [PERF_COUNT_HW_CPU_CYCLES] = 0xf,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
+};
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+ [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x1c10a8, 0x3c1088 },
+ [C(OP_WRITE)] = { 0x2c10a8, 0xc10c3 },
+ [C(OP_PREFETCH)] = { 0xc70e7, -1 },
+ },
+ [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { 0, 0 },
+ },
+ [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0 },
+ [C(OP_WRITE)] = { 0, 0 },
+ [C(OP_PREFETCH)] = { 0xc50c3, 0 },
+ },
+ [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0xc20e4, 0x800c4 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x800c0 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x230e4, 0x230e5 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+};
+
+struct power_pmu power5p_pmu = {
+ .n_counter = 6,
+ .max_alternatives = MAX_ALT,
+ .add_fields = 0x7000000000055ull,
+ .test_adder = 0x3000040000000ull,
+ .compute_mmcr = power5p_compute_mmcr,
+ .get_constraint = power5p_get_constraint,
+ .get_alternatives = power5p_get_alternatives,
+ .disable_pmc = power5p_disable_pmc,
+ .limited_pmc_event = power5p_limited_pmc_event,
+ .flags = PPMU_LIMITED_PMC5_6,
+ .n_generic = ARRAY_SIZE(power5p_generic_events),
+ .generic_events = power5p_generic_events,
+ .cache_events = &power5p_cache_events,
+};
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 00000000000..05600b66221
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,611 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5 (not POWER5++)
+ */
+#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK 0xf
+#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK 0xf
+#define PM_BYTE_SH 12 /* Byte number of event bus to use */
+#define PM_BYTE_MSK 7
+#define PM_GRS_SH 8 /* Storage subsystem mux select */
+#define PM_GRS_MSK 7
+#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
+#define PM_PMCSEL_MSK 0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU 0
+#define PM_ISU0 1
+#define PM_IFU 2
+#define PM_ISU1 3
+#define PM_IDU 4
+#define PM_ISU0_ALT 6
+#define PM_GRS 7
+#define PM_LSU0 8
+#define PM_LSU1 0xc
+#define PM_LASTUNIT 0xc
+
+/*
+ * Bits in MMCR1 for POWER5
+ */
+#define MMCR1_TTM0SEL_SH 62
+#define MMCR1_TTM1SEL_SH 60
+#define MMCR1_TTM2SEL_SH 58
+#define MMCR1_TTM3SEL_SH 56
+#define MMCR1_TTMSEL_MSK 3
+#define MMCR1_TD_CP_DBG0SEL_SH 54
+#define MMCR1_TD_CP_DBG1SEL_SH 52
+#define MMCR1_TD_CP_DBG2SEL_SH 50
+#define MMCR1_TD_CP_DBG3SEL_SH 48
+#define MMCR1_GRS_L2SEL_SH 46
+#define MMCR1_GRS_L2SEL_MSK 3
+#define MMCR1_GRS_L3SEL_SH 44
+#define MMCR1_GRS_L3SEL_MSK 3
+#define MMCR1_GRS_MCSEL_SH 41
+#define MMCR1_GRS_MCSEL_MSK 7
+#define MMCR1_GRS_FABSEL_SH 39
+#define MMCR1_GRS_FABSEL_MSK 3
+#define MMCR1_PMC1_ADDER_SEL_SH 35
+#define MMCR1_PMC2_ADDER_SEL_SH 34
+#define MMCR1_PMC3_ADDER_SEL_SH 33
+#define MMCR1_PMC4_ADDER_SEL_SH 32
+#define MMCR1_PMC1SEL_SH 25
+#define MMCR1_PMC2SEL_SH 17
+#define MMCR1_PMC3SEL_SH 9
+#define MMCR1_PMC4SEL_SH 1
+#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK 0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><>
+ * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1
+ *
+ * T0 - TTM0 constraint
+ * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
+ *
+ * NC - number of counters
+ * 51: NC error 0x0008_0000_0000_0000
+ * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ * 46-47: GRS_L2SEL value
+ * 44-45: GRS_L3SEL value
+ * 41-44: GRS_MCSEL value
+ * 39-40: GRS_FABSEL value
+ * Note that these match up with their bit positions in MMCR1
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ * 37: UC3 error 0x20_0000_0000
+ * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000
+ * 35: ISU0 events needed 0x08_0000_0000
+ * 34: IDU|GRS events needed 0x04_0000_0000
+ *
+ * PS1
+ * 33: PS1 error 0x2_0000_0000
+ * 31-32: count of events needing PMC1/2 0x1_8000_0000
+ *
+ * PS2
+ * 30: PS2 error 0x4000_0000
+ * 28-29: count of events needing PMC3/4 0x3000_0000
+ *
+ * B0
+ * 24-27: Byte 0 event source 0x0f00_0000
+ * Encoding as for the event code
+ *
+ * B1, B2, B3
+ * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P1..P6
+ * 0-11: Count of events needing PMC1..PMC6
+ */
+
+static const int grsel_shift[8] = {
+ MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+ MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+ MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+ [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull },
+ [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull },
+ [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull },
+ [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull },
+ [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull },
+ [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull },
+};
+
+static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+ int pmc, byte, unit, sh;
+ int bit, fmask;
+ u64 mask = 0, value = 0;
+ int grp = -1;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 6)
+ return -1;
+ sh = (pmc - 1) * 2;
+ mask |= 2 << sh;
+ value |= 1 << sh;
+ if (pmc <= 4)
+ grp = (pmc - 1) >> 1;
+ else if (event != 0x500009 && event != 0x600005)
+ return -1;
+ }
+ if (event & PM_BUSEVENT_MSK) {
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ if (unit > PM_LASTUNIT)
+ return -1;
+ if (unit == PM_ISU0_ALT)
+ unit = PM_ISU0;
+ mask |= unit_cons[unit][0];
+ value |= unit_cons[unit][1];
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ if (byte >= 4) {
+ if (unit != PM_LSU1)
+ return -1;
+ /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+ ++unit;
+ byte &= 3;
+ }
+ if (unit == PM_GRS) {
+ bit = event & 7;
+ fmask = (bit == 6)? 7: 3;
+ sh = grsel_shift[bit];
+ mask |= (u64)fmask << sh;
+ value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+ }
+ /*
+ * Bus events on bytes 0 and 2 can be counted
+ * on PMC1/2; bytes 1 and 3 on PMC3/4.
+ */
+ if (!pmc)
+ grp = byte & 1;
+ /* Set byte lane select field */
+ mask |= 0xfULL << (24 - 4 * byte);
+ value |= (u64)unit << (24 - 4 * byte);
+ }
+ if (grp == 0) {
+ /* increment PMC1/2 field */
+ mask |= 0x200000000ull;
+ value |= 0x080000000ull;
+ } else if (grp == 1) {
+ /* increment PMC3/4 field */
+ mask |= 0x40000000ull;
+ value |= 0x10000000ull;
+ }
+ if (pmc < 5) {
+ /* need a counter from PMC1-4 set */
+ mask |= 0x8000000000000ull;
+ value |= 0x1000000000000ull;
+ }
+ *maskp = mask;
+ *valp = value;
+ return 0;
+}
+
+#define MAX_ALT 3 /* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+ { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
+ { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
+ { 0x100005, 0x600005 }, /* PM_RUN_CYC */
+ { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */
+ { 0x300009, 0x400009 }, /* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+ if (event < event_alternatives[i][0])
+ break;
+ for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+ if (event == event_alternatives[i][j])
+ return i;
+ }
+ return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+ /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
+ /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
+ /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
+ /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters. This returns the alternative
+ * event code for those that do, or -1 otherwise.
+ */
+static s64 find_alternative_bdecode(u64 event)
+{
+ int pmc, altpmc, pp, j;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc == 0 || pmc > 4)
+ return -1;
+ altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
+ pp = event & PM_PMCSEL_MSK;
+ for (j = 0; j < 4; ++j) {
+ if (bytedecode_alternatives[pmc - 1][j] == pp) {
+ return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+ (altpmc << PM_PMC_SH) |
+ bytedecode_alternatives[altpmc - 1][j];
+ }
+ }
+ return -1;
+}
+
+static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+ int i, j, nalt = 1;
+ s64 ae;
+
+ alt[0] = event;
+ nalt = 1;
+ i = find_alternative(event);
+ if (i >= 0) {
+ for (j = 0; j < MAX_ALT; ++j) {
+ ae = event_alternatives[i][j];
+ if (ae && ae != event)
+ alt[nalt++] = ae;
+ }
+ } else {
+ ae = find_alternative_bdecode(event);
+ if (ae > 0)
+ alt[nalt++] = ae;
+ }
+ return nalt;
+}
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+ 0, /* 00 */
+ 0x1f, /* 01 PM_IOPS_CMPL */
+ 0x2, /* 02 PM_MRK_GRP_DISP */
+ 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+ 0, /* 04 */
+ 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+ 0x80, /* 06 */
+ 0x80, /* 07 */
+ 0, 0, 0,/* 08 - 0a */
+ 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+ 0, /* 0c */
+ 0x80, /* 0d */
+ 0x80, /* 0e */
+ 0, /* 0f */
+ 0, /* 10 */
+ 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+ 0, /* 12 */
+ 0x10, /* 13 PM_MRK_GRP_CMPL */
+ 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+ 0x2, /* 15 PM_MRK_GRP_ISSUED */
+ 0x80, /* 16 */
+ 0x80, /* 17 */
+ 0, 0, 0, 0, 0,
+ 0x80, /* 1d */
+ 0x80, /* 1e */
+ 0, /* 1f */
+ 0x80, /* 20 */
+ 0x80, /* 21 */
+ 0x80, /* 22 */
+ 0x80, /* 23 */
+ 0x80, /* 24 */
+ 0x80, /* 25 */
+ 0x80, /* 26 */
+ 0x80, /* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5_marked_instr_event(u64 event)
+{
+ int pmc, psel;
+ int bit, byte, unit;
+ u32 mask;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ psel = event & PM_PMCSEL_MSK;
+ if (pmc >= 5)
+ return 0;
+
+ bit = -1;
+ if (psel < sizeof(direct_event_is_marked)) {
+ if (direct_event_is_marked[psel] & (1 << pmc))
+ return 1;
+ if (direct_event_is_marked[psel] & 0x80)
+ bit = 4;
+ else if (psel == 0x08)
+ bit = pmc - 1;
+ else if (psel == 0x10)
+ bit = 4 - pmc;
+ else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+ bit = 4;
+ } else if ((psel & 0x58) == 0x40)
+ bit = psel & 7;
+
+ if (!(event & PM_BUSEVENT_MSK))
+ return 0;
+
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ if (unit == PM_LSU0) {
+ /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+ mask = 0x5dff00;
+ } else if (unit == PM_LSU1 && byte >= 4) {
+ byte -= 4;
+ /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
+ mask = 0x5f00c0aa;
+ } else
+ return 0;
+
+ return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int power5_compute_mmcr(u64 event[], int n_ev,
+ unsigned int hwc[], u64 mmcr[])
+{
+ u64 mmcr1 = 0;
+ u64 mmcra = 0;
+ unsigned int pmc, unit, byte, psel;
+ unsigned int ttm, grp;
+ int i, isbus, bit, grsel;
+ unsigned int pmc_inuse = 0;
+ unsigned int pmc_grp_use[2];
+ unsigned char busbyte[4];
+ unsigned char unituse[16];
+ int ttmuse;
+
+ if (n_ev > 6)
+ return -1;
+
+ /* First pass to count resource use */
+ pmc_grp_use[0] = pmc_grp_use[1] = 0;
+ memset(busbyte, 0, sizeof(busbyte));
+ memset(unituse, 0, sizeof(unituse));
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 6)
+ return -1;
+ if (pmc_inuse & (1 << (pmc - 1)))
+ return -1;
+ pmc_inuse |= 1 << (pmc - 1);
+ /* count 1/2 vs 3/4 use */
+ if (pmc <= 4)
+ ++pmc_grp_use[(pmc - 1) >> 1];
+ }
+ if (event[i] & PM_BUSEVENT_MSK) {
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ if (unit > PM_LASTUNIT)
+ return -1;
+ if (unit == PM_ISU0_ALT)
+ unit = PM_ISU0;
+ if (byte >= 4) {
+ if (unit != PM_LSU1)
+ return -1;
+ ++unit;
+ byte &= 3;
+ }
+ if (!pmc)
+ ++pmc_grp_use[byte & 1];
+ if (busbyte[byte] && busbyte[byte] != unit)
+ return -1;
+ busbyte[byte] = unit;
+ unituse[unit] = 1;
+ }
+ }
+ if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
+ return -1;
+
+ /*
+ * Assign resources and set multiplexer selects.
+ *
+ * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+ * choice we have to deal with.
+ */
+ if (unituse[PM_ISU0] &
+ (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+ unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
+ unituse[PM_ISU0] = 0;
+ }
+ /* Set TTM[01]SEL fields. */
+ ttmuse = 0;
+ for (i = PM_FPU; i <= PM_ISU1; ++i) {
+ if (!unituse[i])
+ continue;
+ if (ttmuse++)
+ return -1;
+ mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+ }
+ ttmuse = 0;
+ for (; i <= PM_GRS; ++i) {
+ if (!unituse[i])
+ continue;
+ if (ttmuse++)
+ return -1;
+ mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+ }
+ if (ttmuse > 1)
+ return -1;
+
+ /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+ for (byte = 0; byte < 4; ++byte) {
+ unit = busbyte[byte];
+ if (!unit)
+ continue;
+ if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+ /* get ISU0 through TTM1 rather than TTM0 */
+ unit = PM_ISU0_ALT;
+ } else if (unit == PM_LSU1 + 1) {
+ /* select lower word of LSU1 for this byte */
+ mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+ }
+ ttm = unit >> 2;
+ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+ }
+
+ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ psel = event[i] & PM_PMCSEL_MSK;
+ isbus = event[i] & PM_BUSEVENT_MSK;
+ if (!pmc) {
+ /* Bus event or any-PMC direct event */
+ for (pmc = 0; pmc < 4; ++pmc) {
+ if (pmc_inuse & (1 << pmc))
+ continue;
+ grp = (pmc >> 1) & 1;
+ if (isbus) {
+ if (grp == (byte & 1))
+ break;
+ } else if (pmc_grp_use[grp] < 2) {
+ ++pmc_grp_use[grp];
+ break;
+ }
+ }
+ pmc_inuse |= 1 << pmc;
+ } else if (pmc <= 4) {
+ /* Direct event */
+ --pmc;
+ if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
+ /* add events on higher-numbered bus */
+ mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+ } else {
+ /* Instructions or run cycles on PMC5/6 */
+ --pmc;
+ }
+ if (isbus && unit == PM_GRS) {
+ bit = psel & 7;
+ grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+ mmcr1 |= (u64)grsel << grsel_shift[bit];
+ }
+ if (power5_marked_instr_event(event[i]))
+ mmcra |= MMCRA_SAMPLE_ENABLE;
+ if (pmc <= 3)
+ mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+ hwc[i] = pmc;
+ }
+
+ /* Return MMCRx values */
+ mmcr[0] = 0;
+ if (pmc_inuse & 1)
+ mmcr[0] = MMCR0_PMC1CE;
+ if (pmc_inuse & 0x3e)
+ mmcr[0] |= MMCR0_PMCjCE;
+ mmcr[1] = mmcr1;
+ mmcr[2] = mmcra;
+ return 0;
+}
+
+static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+ if (pmc <= 3)
+ mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5_generic_events[] = {
+ [PERF_COUNT_HW_CPU_CYCLES] = 0xf,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
+};
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+ [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x4c1090, 0x3c1088 },
+ [C(OP_WRITE)] = { 0x3c1090, 0xc10c3 },
+ [C(OP_PREFETCH)] = { 0xc70e7, 0 },
+ },
+ [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { 0, 0 },
+ },
+ [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x3c309b },
+ [C(OP_WRITE)] = { 0, 0 },
+ [C(OP_PREFETCH)] = { 0xc50c3, 0 },
+ },
+ [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x2c4090, 0x800c4 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x800c0 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x230e4, 0x230e5 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+};
+
+struct power_pmu power5_pmu = {
+ .n_counter = 6,
+ .max_alternatives = MAX_ALT,
+ .add_fields = 0x7000090000555ull,
+ .test_adder = 0x3000490000000ull,
+ .compute_mmcr = power5_compute_mmcr,
+ .get_constraint = power5_get_constraint,
+ .get_alternatives = power5_get_alternatives,
+ .disable_pmc = power5_disable_pmc,
+ .n_generic = ARRAY_SIZE(power5_generic_events),
+ .generic_events = power5_generic_events,
+ .cache_events = &power5_cache_events,
+};
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 00000000000..46f74bebcfd
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,532 @@
+/*
+ * Performance counter support for POWER6 processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER6
+ */
+#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK 0x7
+#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH 16 /* Unit event comes (TTMxSEL encoding) */
+#define PM_UNIT_MSK 0xf
+#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
+#define PM_LLAV 0x8000 /* Load lookahead match value */
+#define PM_LLA 0x4000 /* Load lookahead match enable */
+#define PM_BYTE_SH 12 /* Byte of event bus to use */
+#define PM_BYTE_MSK 3
+#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
+#define PM_SUBUNIT_MSK 7
+#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
+#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
+#define PM_BUSEVENT_MSK 0xf3700
+
+/*
+ * Bits in MMCR1 for POWER6
+ */
+#define MMCR1_TTM0SEL_SH 60
+#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
+#define MMCR1_TTMSEL_MSK 0xf
+#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
+#define MMCR1_NESTSEL_SH 45
+#define MMCR1_NESTSEL_MSK 0x7
+#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
+#define MMCR1_PMC1_LLA ((u64)1 << 44)
+#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39)
+#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35)
+#define MMCR1_PMC1SEL_SH 24
+#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK 0xff
+
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value >> 1.
+ * Bottom 4 bits are a map of which PMCs are interesting,
+ * top 4 bits say what sort of event:
+ * 0 = direct marked event,
+ * 1 = byte decode event,
+ * 4 = add/and event (PMC1 -> bits 0 & 4),
+ * 5 = add/and event (PMC1 -> bits 1 & 5),
+ * 6 = add/and event (PMC1 -> bits 2 & 6),
+ * 7 = add/and event (PMC1 -> bits 3 & 7).
+ */
+static unsigned char direct_event_is_marked[0x60 >> 1] = {
+ 0, /* 00 */
+ 0, /* 02 */
+ 0, /* 04 */
+ 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+ 0x04, /* 08 PM_MRK_DFU_FIN */
+ 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
+ 0, /* 0c */
+ 0, /* 0e */
+ 0x02, /* 10 PM_MRK_INST_DISP */
+ 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */
+ 0, /* 14 */
+ 0, /* 16 */
+ 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
+ 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
+ 0x01, /* 1c PM_MRK_INST_ISSUED */
+ 0, /* 1e */
+ 0, /* 20 */
+ 0, /* 22 */
+ 0, /* 24 */
+ 0, /* 26 */
+ 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
+ 0, /* 2a */
+ 0, /* 2c */
+ 0, /* 2e */
+ 0x4f, /* 30 */
+ 0x7f, /* 32 */
+ 0x4f, /* 34 */
+ 0x5f, /* 36 */
+ 0x6f, /* 38 */
+ 0x4f, /* 3a */
+ 0, /* 3c */
+ 0x08, /* 3e PM_MRK_INST_TIMEO */
+ 0x1f, /* 40 */
+ 0x1f, /* 42 */
+ 0x1f, /* 44 */
+ 0x1f, /* 46 */
+ 0x1f, /* 48 */
+ 0x1f, /* 4a */
+ 0x1f, /* 4c */
+ 0x1f, /* 4e */
+ 0, /* 50 */
+ 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
+ 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
+ 0x02, /* 56 PM_MRK_LD_MISS_L1 */
+ 0, /* 58 */
+ 0, /* 5a */
+ 0, /* 5c */
+ 0, /* 5e */
+};
+
+/*
+ * Masks showing for each unit which bits are marked events.
+ * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
+ */
+static u32 marked_bus_events[16] = {
+ 0x01000000, /* direct events set 1: byte 3 bit 0 */
+ 0x00010000, /* direct events set 2: byte 2 bit 0 */
+ 0, 0, 0, 0, /* IDU, IFU, nest: nothing */
+ 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */
+ 0x000000c0, /* VMX set 2: byte 0 bits 4-7 */
+ 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
+ 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */
+ 0, /* LSU set 3 */
+ 0x00000010, /* VMX set 3: byte 0 bit 4 */
+ 0, /* BFP set 1 */
+ 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */
+ 0, 0
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power6_marked_instr_event(u64 event)
+{
+ int pmc, psel, ptype;
+ int bit, byte, unit;
+ u32 mask;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */
+ if (pmc >= 5)
+ return 0;
+
+ bit = -1;
+ if (psel < sizeof(direct_event_is_marked)) {
+ ptype = direct_event_is_marked[psel];
+ if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
+ return 0;
+ ptype >>= 4;
+ if (ptype == 0)
+ return 1;
+ if (ptype == 1)
+ bit = 0;
+ else
+ bit = ptype ^ (pmc - 1);
+ } else if ((psel & 0x48) == 0x40)
+ bit = psel & 7;
+
+ if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+ return 0;
+
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ mask = marked_bus_events[unit];
+ return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/*
+ * Assign PMC numbers and compute MMCR1 value for a set of events
+ */
+static int p6_compute_mmcr(u64 event[], int n_ev,
+ unsigned int hwc[], u64 mmcr[])
+{
+ u64 mmcr1 = 0;
+ u64 mmcra = 0;
+ int i;
+ unsigned int pmc, ev, b, u, s, psel;
+ unsigned int ttmset = 0;
+ unsigned int pmc_inuse = 0;
+
+ if (n_ev > 6)
+ return -1;
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc_inuse & (1 << (pmc - 1)))
+ return -1; /* collision! */
+ pmc_inuse |= 1 << (pmc - 1);
+ }
+ }
+ for (i = 0; i < n_ev; ++i) {
+ ev = event[i];
+ pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ --pmc;
+ } else {
+ /* can go on any PMC; find a free one */
+ for (pmc = 0; pmc < 4; ++pmc)
+ if (!(pmc_inuse & (1 << pmc)))
+ break;
+ if (pmc >= 4)
+ return -1;
+ pmc_inuse |= 1 << pmc;
+ }
+ hwc[i] = pmc;
+ psel = ev & PM_PMCSEL_MSK;
+ if (ev & PM_BUSEVENT_MSK) {
+ /* this event uses the event bus */
+ b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
+ u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
+ /* check for conflict on this byte of event bus */
+ if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
+ return -1;
+ mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
+ ttmset |= 1 << b;
+ if (u == 5) {
+ /* Nest events have a further mux */
+ s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+ if ((ttmset & 0x10) &&
+ MMCR1_NESTSEL(mmcr1) != s)
+ return -1;
+ ttmset |= 0x10;
+ mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
+ }
+ if (0x30 <= psel && psel <= 0x3d) {
+ /* these need the PMCx_ADDR_SEL bits */
+ if (b >= 2)
+ mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
+ }
+ /* bus select values are different for PMC3/4 */
+ if (pmc >= 2 && (psel & 0x90) == 0x80)
+ psel ^= 0x20;
+ }
+ if (ev & PM_LLA) {
+ mmcr1 |= MMCR1_PMC1_LLA >> pmc;
+ if (ev & PM_LLAV)
+ mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
+ }
+ if (power6_marked_instr_event(event[i]))
+ mmcra |= MMCRA_SAMPLE_ENABLE;
+ if (pmc < 4)
+ mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+ }
+ mmcr[0] = 0;
+ if (pmc_inuse & 1)
+ mmcr[0] = MMCR0_PMC1CE;
+ if (pmc_inuse & 0xe)
+ mmcr[0] |= MMCR0_PMCjCE;
+ mmcr[1] = mmcr1;
+ mmcr[2] = mmcra;
+ return 0;
+}
+
+/*
+ * Layout of constraint bits:
+ *
+ * 0-1 add field: number of uses of PMC1 (max 1)
+ * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
+ * 12-15 add field: number of uses of PMC1-4 (max 4)
+ * 16-19 select field: unit on byte 0 of event bus
+ * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ * 32-34 select field: nest (subunit) event selector
+ */
+static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+ int pmc, byte, sh, subunit;
+ u64 mask = 0, value = 0;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
+ return -1;
+ sh = (pmc - 1) * 2;
+ mask |= 2 << sh;
+ value |= 1 << sh;
+ }
+ if (event & PM_BUSEVENT_MSK) {
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ sh = byte * 4 + (16 - PM_UNIT_SH);
+ mask |= PM_UNIT_MSKS << sh;
+ value |= (u64)(event & PM_UNIT_MSKS) << sh;
+ if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
+ subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+ mask |= (u64)PM_SUBUNIT_MSK << 32;
+ value |= (u64)subunit << 32;
+ }
+ }
+ if (pmc <= 4) {
+ mask |= 0x8000; /* add field for count of PMC1-4 uses */
+ value |= 0x1000;
+ }
+ *maskp = mask;
+ *valp = value;
+ return 0;
+}
+
+static int p6_limited_pmc_event(u64 event)
+{
+ int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+ return pmc == 5 || pmc == 6;
+}
+
+#define MAX_ALT 4 /* at most 4 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+ { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
+ { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
+ { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
+ { 0x10000a, 0x2000f4, 0x600005 }, /* PM_RUN_CYC */
+ { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
+ { 0x10000e, 0x400010 }, /* PM_PURR */
+ { 0x100010, 0x4000f8 }, /* PM_FLUSH */
+ { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
+ { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
+ { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
+ { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
+ { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
+ { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
+ { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
+ { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
+ { 0x200012, 0x300012 }, /* PM_INST_DISP */
+ { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
+ { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
+ { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
+ { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
+ { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
+ { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
+ { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
+};
+
+/*
+ * This could be made more efficient with a binary search on
+ * a presorted list, if necessary
+ */
+static int find_alternatives_list(u64 event)
+{
+ int i, j;
+ unsigned int alt;
+
+ for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+ if (event < event_alternatives[i][0])
+ return -1;
+ for (j = 0; j < MAX_ALT; ++j) {
+ alt = event_alternatives[i][j];
+ if (!alt || event < alt)
+ break;
+ if (event == alt)
+ return i;
+ }
+ }
+ return -1;
+}
+
+static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+ int i, j, nlim;
+ unsigned int psel, pmc;
+ unsigned int nalt = 1;
+ u64 aevent;
+
+ alt[0] = event;
+ nlim = p6_limited_pmc_event(event);
+
+ /* check the alternatives table */
+ i = find_alternatives_list(event);
+ if (i >= 0) {
+ /* copy out alternatives from list */
+ for (j = 0; j < MAX_ALT; ++j) {
+ aevent = event_alternatives[i][j];
+ if (!aevent)
+ break;
+ if (aevent != event)
+ alt[nalt++] = aevent;
+ nlim += p6_limited_pmc_event(aevent);
+ }
+
+ } else {
+ /* Check for alternative ways of computing sum events */
+ /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
+ psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc && (psel == 0x32 || psel == 0x34))
+ alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
+ ((5 - pmc) << PM_PMC_SH);
+
+ /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
+ if (pmc && (psel == 0x38 || psel == 0x3a))
+ alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
+ ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
+ }
+
+ if (flags & PPMU_ONLY_COUNT_RUN) {
+ /*
+ * We're only counting in RUN state,
+ * so PM_CYC is equivalent to PM_RUN_CYC,
+ * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
+ * This doesn't include alternatives that don't provide
+ * any extra flexibility in assigning PMCs (e.g.
+ * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
+ * Note that even with these additional alternatives
+ * we never end up with more than 4 alternatives for any event.
+ */
+ j = nalt;
+ for (i = 0; i < nalt; ++i) {
+ switch (alt[i]) {
+ case 0x1e: /* PM_CYC */
+ alt[j++] = 0x600005; /* PM_RUN_CYC */
+ ++nlim;
+ break;
+ case 0x10000a: /* PM_RUN_CYC */
+ alt[j++] = 0x1e; /* PM_CYC */
+ break;
+ case 2: /* PM_INST_CMPL */
+ alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
+ ++nlim;
+ break;
+ case 0x500009: /* PM_RUN_INST_CMPL */
+ alt[j++] = 2; /* PM_INST_CMPL */
+ break;
+ case 0x10000e: /* PM_PURR */
+ alt[j++] = 0x4000f4; /* PM_RUN_PURR */
+ break;
+ case 0x4000f4: /* PM_RUN_PURR */
+ alt[j++] = 0x10000e; /* PM_PURR */
+ break;
+ }
+ }
+ nalt = j;
+ }
+
+ if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+ /* remove the limited PMC events */
+ j = 0;
+ for (i = 0; i < nalt; ++i) {
+ if (!p6_limited_pmc_event(alt[i])) {
+ alt[j] = alt[i];
+ ++j;
+ }
+ }
+ nalt = j;
+ } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+ /* remove all but the limited PMC events */
+ j = 0;
+ for (i = 0; i < nalt; ++i) {
+ if (p6_limited_pmc_event(alt[i])) {
+ alt[j] = alt[i];
+ ++j;
+ }
+ }
+ nalt = j;
+ }
+
+ return nalt;
+}
+
+static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+ /* Set PMCxSEL to 0 to disable PMCx */
+ if (pmc <= 3)
+ mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power6_generic_events[] = {
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 2,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
+};
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
+ */
+static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+ [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x80082, 0x80080 },
+ [C(OP_WRITE)] = { 0x80086, 0x80088 },
+ [C(OP_PREFETCH)] = { 0x810a4, 0 },
+ },
+ [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x100056 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { 0x4008c, 0 },
+ },
+ [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x150730, 0x250532 },
+ [C(OP_WRITE)] = { 0x250432, 0x150432 },
+ [C(OP_PREFETCH)] = { 0x810a6, 0 },
+ },
+ [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x20000e },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x420ce },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x430e6, 0x400052 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+};
+
+struct power_pmu power6_pmu = {
+ .n_counter = 6,
+ .max_alternatives = MAX_ALT,
+ .add_fields = 0x1555,
+ .test_adder = 0x3000,
+ .compute_mmcr = p6_compute_mmcr,
+ .get_constraint = p6_get_constraint,
+ .get_alternatives = p6_get_alternatives,
+ .disable_pmc = p6_disable_pmc,
+ .limited_pmc_event = p6_limited_pmc_event,
+ .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
+ .n_generic = ARRAY_SIZE(power6_generic_events),
+ .generic_events = power6_generic_events,
+ .cache_events = &power6_cache_events,
+};
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
new file mode 100644
index 00000000000..b3f7d1216ba
--- /dev/null
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -0,0 +1,357 @@
+/*
+ * Performance counter support for POWER7 processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER7
+ */
+#define PM_PMC_SH 16 /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK 0xf
+#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH 12 /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK 0xf
+#define PM_COMBINE_SH 11 /* Combined event bit */
+#define PM_COMBINE_MSK 1
+#define PM_COMBINE_MSKS 0x800
+#define PM_L2SEL_SH 8 /* L2 event select */
+#define PM_L2SEL_MSK 7
+#define PM_PMCSEL_MSK 0xff
+
+/*
+ * Bits in MMCR1 for POWER7
+ */
+#define MMCR1_TTM0SEL_SH 60
+#define MMCR1_TTM1SEL_SH 56
+#define MMCR1_TTM2SEL_SH 52
+#define MMCR1_TTM3SEL_SH 48
+#define MMCR1_TTMSEL_MSK 0xf
+#define MMCR1_L2SEL_SH 45
+#define MMCR1_L2SEL_MSK 7
+#define MMCR1_PMC1_COMBINE_SH 35
+#define MMCR1_PMC2_COMBINE_SH 34
+#define MMCR1_PMC3_COMBINE_SH 33
+#define MMCR1_PMC4_COMBINE_SH 32
+#define MMCR1_PMC1SEL_SH 24
+#define MMCR1_PMC2SEL_SH 16
+#define MMCR1_PMC3SEL_SH 8
+#define MMCR1_PMC4SEL_SH 0
+#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK 0xff
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ * [ ><><><><><><>
+ * NC P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ * 15: NC error 0x8000
+ * 12-14: number of events needing PMC1-4 0x7000
+ *
+ * P6
+ * 11: P6 error 0x800
+ * 10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ * 0-9: Count of events needing PMC1..PMC5
+ */
+
+static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+ int pmc, sh;
+ u64 mask = 0, value = 0;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 6)
+ return -1;
+ sh = (pmc - 1) * 2;
+ mask |= 2 << sh;
+ value |= 1 << sh;
+ if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
+ return -1;
+ }
+ if (pmc < 5) {
+ /* need a counter from PMC1-4 set */
+ mask |= 0x8000;
+ value |= 0x1000;
+ }
+ *maskp = mask;
+ *valp = value;
+ return 0;
+}
+
+#define MAX_ALT 2 /* at most 2 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+ { 0x200f2, 0x300f2 }, /* PM_INST_DISP */
+ { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */
+ { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+ int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+ if (event < event_alternatives[i][0])
+ break;
+ for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+ if (event == event_alternatives[i][j])
+ return i;
+ }
+ return -1;
+}
+
+static s64 find_alternative_decode(u64 event)
+{
+ int pmc, psel;
+
+ /* this only handles the 4x decode events */
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ psel = event & PM_PMCSEL_MSK;
+ if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
+ return event - (1 << PM_PMC_SH) + 8;
+ if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
+ return event + (1 << PM_PMC_SH) - 8;
+ return -1;
+}
+
+static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+ int i, j, nalt = 1;
+ s64 ae;
+
+ alt[0] = event;
+ nalt = 1;
+ i = find_alternative(event);
+ if (i >= 0) {
+ for (j = 0; j < MAX_ALT; ++j) {
+ ae = event_alternatives[i][j];
+ if (ae && ae != event)
+ alt[nalt++] = ae;
+ }
+ } else {
+ ae = find_alternative_decode(event);
+ if (ae > 0)
+ alt[nalt++] = ae;
+ }
+
+ if (flags & PPMU_ONLY_COUNT_RUN) {
+ /*
+ * We're only counting in RUN state,
+ * so PM_CYC is equivalent to PM_RUN_CYC
+ * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+ * This doesn't include alternatives that don't provide
+ * any extra flexibility in assigning PMCs.
+ */
+ j = nalt;
+ for (i = 0; i < nalt; ++i) {
+ switch (alt[i]) {
+ case 0x1e: /* PM_CYC */
+ alt[j++] = 0x600f4; /* PM_RUN_CYC */
+ break;
+ case 0x600f4: /* PM_RUN_CYC */
+ alt[j++] = 0x1e;
+ break;
+ case 0x2: /* PM_PPC_CMPL */
+ alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */
+ break;
+ case 0x500fa: /* PM_RUN_INST_CMPL */
+ alt[j++] = 0x2; /* PM_PPC_CMPL */
+ break;
+ }
+ }
+ nalt = j;
+ }
+
+ return nalt;
+}
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power7_marked_instr_event(u64 event)
+{
+ int pmc, psel;
+ int unit;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ psel = event & PM_PMCSEL_MSK & ~1; /* trim off edge/level bit */
+ if (pmc >= 5)
+ return 0;
+
+ switch (psel >> 4) {
+ case 2:
+ return pmc == 2 || pmc == 4;
+ case 3:
+ if (psel == 0x3c)
+ return pmc == 1;
+ if (psel == 0x3e)
+ return pmc != 2;
+ return 1;
+ case 4:
+ case 5:
+ return unit == 0xd;
+ case 6:
+ if (psel == 0x64)
+ return pmc >= 3;
+ case 8:
+ return unit == 0xd;
+ }
+ return 0;
+}
+
+static int power7_compute_mmcr(u64 event[], int n_ev,
+ unsigned int hwc[], u64 mmcr[])
+{
+ u64 mmcr1 = 0;
+ u64 mmcra = 0;
+ unsigned int pmc, unit, combine, l2sel, psel;
+ unsigned int pmc_inuse = 0;
+ int i;
+
+ /* First pass to count resource use */
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 6)
+ return -1;
+ if (pmc_inuse & (1 << (pmc - 1)))
+ return -1;
+ pmc_inuse |= 1 << (pmc - 1);
+ }
+ }
+
+ /* Second pass: assign PMCs, set all MMCR1 fields */
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
+ l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
+ psel = event[i] & PM_PMCSEL_MSK;
+ if (!pmc) {
+ /* Bus event or any-PMC direct event */
+ for (pmc = 0; pmc < 4; ++pmc) {
+ if (!(pmc_inuse & (1 << pmc)))
+ break;
+ }
+ if (pmc >= 4)
+ return -1;
+ pmc_inuse |= 1 << pmc;
+ } else {
+ /* Direct or decoded event */
+ --pmc;
+ }
+ if (pmc <= 3) {
+ mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
+ mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+ mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+ if (unit == 6) /* L2 events */
+ mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+ }
+ if (power7_marked_instr_event(event[i]))
+ mmcra |= MMCRA_SAMPLE_ENABLE;
+ hwc[i] = pmc;
+ }
+
+ /* Return MMCRx values */
+ mmcr[0] = 0;
+ if (pmc_inuse & 1)
+ mmcr[0] = MMCR0_PMC1CE;
+ if (pmc_inuse & 0x3e)
+ mmcr[0] |= MMCR0_PMCjCE;
+ mmcr[1] = mmcr1;
+ mmcr[2] = mmcra;
+ return 0;
+}
+
+static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+ if (pmc <= 3)
+ mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power7_generic_events[] = {
+ [PERF_COUNT_CPU_CYCLES] = 0x1e,
+ [PERF_COUNT_INSTRUCTIONS] = 2,
+ [PERF_COUNT_CACHE_REFERENCES] = 0xc880, /* LD_REF_L1_LSU */
+ [PERF_COUNT_CACHE_MISSES] = 0x400f0, /* LD_MISS_L1 */
+ [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x10068, /* BRU_FIN */
+ [PERF_COUNT_BRANCH_MISSES] = 0x400f6, /* BR_MPRED */
+};
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+ [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x400f0, 0xc880 },
+ [C(OP_WRITE)] = { 0, 0x300f0 },
+ [C(OP_PREFETCH)] = { 0xd8b8, 0 },
+ },
+ [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x200fc },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { 0x408a, 0 },
+ },
+ [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x6080, 0x6084 },
+ [C(OP_WRITE)] = { 0x6082, 0x6086 },
+ [C(OP_PREFETCH)] = { 0, 0 },
+ },
+ [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x300fc },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x400fc },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x10068, 0x400f6 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+};
+
+struct power_pmu power7_pmu = {
+ .n_counter = 6,
+ .max_alternatives = MAX_ALT + 1,
+ .add_fields = 0x1555ull,
+ .test_adder = 0x3000ull,
+ .compute_mmcr = power7_compute_mmcr,
+ .get_constraint = power7_get_constraint,
+ .get_alternatives = power7_get_alternatives,
+ .disable_pmc = power7_disable_pmc,
+ .n_generic = ARRAY_SIZE(power7_generic_events),
+ .generic_events = power7_generic_events,
+ .cache_events = &power7_cache_events,
+};
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 00000000000..ba0a357a89f
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,482 @@
+/*
+ * Performance counter support for PPC970-family processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/string.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for PPC970
+ */
+#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
+#define PM_PMC_MSK 0xf
+#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK 0xf
+#define PM_SPCSEL_SH 6
+#define PM_SPCSEL_MSK 3
+#define PM_BYTE_SH 4 /* Byte number of event bus to use */
+#define PM_BYTE_MSK 3
+#define PM_PMCSEL_MSK 0xf
+
+/* Values in PM_UNIT field */
+#define PM_NONE 0
+#define PM_FPU 1
+#define PM_VPU 2
+#define PM_ISU 3
+#define PM_IFU 4
+#define PM_IDU 5
+#define PM_STS 6
+#define PM_LSU0 7
+#define PM_LSU1U 8
+#define PM_LSU1L 9
+#define PM_LASTUNIT 9
+
+/*
+ * Bits in MMCR0 for PPC970
+ */
+#define MMCR0_PMC1SEL_SH 8
+#define MMCR0_PMC2SEL_SH 1
+#define MMCR_PMCSEL_MSK 0x1f
+
+/*
+ * Bits in MMCR1 for PPC970
+ */
+#define MMCR1_TTM0SEL_SH 62
+#define MMCR1_TTM1SEL_SH 59
+#define MMCR1_TTM3SEL_SH 53
+#define MMCR1_TTMSEL_MSK 3
+#define MMCR1_TD_CP_DBG0SEL_SH 50
+#define MMCR1_TD_CP_DBG1SEL_SH 48
+#define MMCR1_TD_CP_DBG2SEL_SH 46
+#define MMCR1_TD_CP_DBG3SEL_SH 44
+#define MMCR1_PMC1_ADDER_SEL_SH 39
+#define MMCR1_PMC2_ADDER_SEL_SH 38
+#define MMCR1_PMC6_ADDER_SEL_SH 37
+#define MMCR1_PMC5_ADDER_SEL_SH 36
+#define MMCR1_PMC8_ADDER_SEL_SH 35
+#define MMCR1_PMC7_ADDER_SEL_SH 34
+#define MMCR1_PMC3_ADDER_SEL_SH 33
+#define MMCR1_PMC4_ADDER_SEL_SH 32
+#define MMCR1_PMC3SEL_SH 27
+#define MMCR1_PMC4SEL_SH 22
+#define MMCR1_PMC5SEL_SH 17
+#define MMCR1_PMC6SEL_SH 12
+#define MMCR1_PMC7SEL_SH 7
+#define MMCR1_PMC8SEL_SH 2
+
+static short mmcr1_adder_bits[8] = {
+ MMCR1_PMC1_ADDER_SEL_SH,
+ MMCR1_PMC2_ADDER_SEL_SH,
+ MMCR1_PMC3_ADDER_SEL_SH,
+ MMCR1_PMC4_ADDER_SEL_SH,
+ MMCR1_PMC5_ADDER_SEL_SH,
+ MMCR1_PMC6_ADDER_SEL_SH,
+ MMCR1_PMC7_ADDER_SEL_SH,
+ MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><>
+ * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
+ *
+ * SP - SPCSEL constraint
+ * 48-49: SPCSEL value 0x3_0000_0000_0000
+ *
+ * T0 - TTM0 constraint
+ * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
+ * 43: UC3 error 0x0800_0000_0000
+ * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
+ * 41: ISU events needed 0x0200_0000_0000
+ * 40: IDU|STS events needed 0x0100_0000_0000
+ *
+ * PS1
+ * 39: PS1 error 0x0080_0000_0000
+ * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ * 35: PS2 error 0x0008_0000_0000
+ * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ * 28-31: Byte 0 event source 0xf000_0000
+ * Encoding as for the event code
+ *
+ * B1, B2, B3
+ * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P1
+ * 15: P1 error 0x8000
+ * 14-15: Count of events needing PMC1
+ *
+ * P2..P8
+ * 0-13: Count of events needing PMC2..PMC8
+ */
+
+static unsigned char direct_marked_event[8] = {
+ (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+ (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+ (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
+ (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+ (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
+ (1<<3) | (1<<4) | (1<<5),
+ /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+ (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+ (1<<4) /* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p970_marked_instr_event(u64 event)
+{
+ int pmc, psel, unit, byte, bit;
+ unsigned int mask;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ psel = event & PM_PMCSEL_MSK;
+ if (pmc) {
+ if (direct_marked_event[pmc - 1] & (1 << psel))
+ return 1;
+ if (psel == 0) /* add events */
+ bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+ else if (psel == 7 || psel == 13) /* decode events */
+ bit = 4;
+ else
+ return 0;
+ } else
+ bit = psel;
+
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ mask = 0;
+ switch (unit) {
+ case PM_VPU:
+ mask = 0x4c; /* byte 0 bits 2,3,6 */
+ case PM_LSU0:
+ /* byte 2 bits 0,2,3,4,6; all of byte 1 */
+ mask = 0x085dff00;
+ case PM_LSU1L:
+ mask = 0x50 << 24; /* byte 3 bits 4,6 */
+ break;
+ }
+ return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+ [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
+ [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
+ [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
+ [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
+ [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
+ [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
+};
+
+static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+ int pmc, byte, unit, sh, spcsel;
+ u64 mask = 0, value = 0;
+ int grp = -1;
+
+ pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc > 8)
+ return -1;
+ sh = (pmc - 1) * 2;
+ mask |= 2 << sh;
+ value |= 1 << sh;
+ grp = ((pmc - 1) >> 1) & 1;
+ }
+ unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+ if (unit) {
+ if (unit > PM_LASTUNIT)
+ return -1;
+ mask |= unit_cons[unit][0];
+ value |= unit_cons[unit][1];
+ byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+ /*
+ * Bus events on bytes 0 and 2 can be counted
+ * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+ */
+ if (!pmc)
+ grp = byte & 1;
+ /* Set byte lane select field */
+ mask |= 0xfULL << (28 - 4 * byte);
+ value |= (u64)unit << (28 - 4 * byte);
+ }
+ if (grp == 0) {
+ /* increment PMC1/2/5/6 field */
+ mask |= 0x8000000000ull;
+ value |= 0x1000000000ull;
+ } else if (grp == 1) {
+ /* increment PMC3/4/7/8 field */
+ mask |= 0x800000000ull;
+ value |= 0x100000000ull;
+ }
+ spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+ if (spcsel) {
+ mask |= 3ull << 48;
+ value |= (u64)spcsel << 48;
+ }
+ *maskp = mask;
+ *valp = value;
+ return 0;
+}
+
+static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+ alt[0] = event;
+
+ /* 2 alternatives for LSU empty */
+ if (event == 0x2002 || event == 0x3002) {
+ alt[1] = event ^ 0x1000;
+ return 2;
+ }
+
+ return 1;
+}
+
+static int p970_compute_mmcr(u64 event[], int n_ev,
+ unsigned int hwc[], u64 mmcr[])
+{
+ u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+ unsigned int pmc, unit, byte, psel;
+ unsigned int ttm, grp;
+ unsigned int pmc_inuse = 0;
+ unsigned int pmc_grp_use[2];
+ unsigned char busbyte[4];
+ unsigned char unituse[16];
+ unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
+ unsigned char ttmuse[2];
+ unsigned char pmcsel[8];
+ int i;
+ int spcsel;
+
+ if (n_ev > 8)
+ return -1;
+
+ /* First pass to count resource use */
+ pmc_grp_use[0] = pmc_grp_use[1] = 0;
+ memset(busbyte, 0, sizeof(busbyte));
+ memset(unituse, 0, sizeof(unituse));
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ if (pmc) {
+ if (pmc_inuse & (1 << (pmc - 1)))
+ return -1;
+ pmc_inuse |= 1 << (pmc - 1);
+ /* count 1/2/5/6 vs 3/4/7/8 use */
+ ++pmc_grp_use[((pmc - 1) >> 1) & 1];
+ }
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ if (unit) {
+ if (unit > PM_LASTUNIT)
+ return -1;
+ if (!pmc)
+ ++pmc_grp_use[byte & 1];
+ if (busbyte[byte] && busbyte[byte] != unit)
+ return -1;
+ busbyte[byte] = unit;
+ unituse[unit] = 1;
+ }
+ }
+ if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+ return -1;
+
+ /*
+ * Assign resources and set multiplexer selects.
+ *
+ * PM_ISU can go either on TTM0 or TTM1, but that's the only
+ * choice we have to deal with.
+ */
+ if (unituse[PM_ISU] &
+ (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
+ unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
+ /* Set TTM[01]SEL fields. */
+ ttmuse[0] = ttmuse[1] = 0;
+ for (i = PM_FPU; i <= PM_STS; ++i) {
+ if (!unituse[i])
+ continue;
+ ttm = unitmap[i];
+ ++ttmuse[(ttm >> 2) & 1];
+ mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
+ }
+ /* Check only one unit per TTMx */
+ if (ttmuse[0] > 1 || ttmuse[1] > 1)
+ return -1;
+
+ /* Set byte lane select fields and TTM3SEL. */
+ for (byte = 0; byte < 4; ++byte) {
+ unit = busbyte[byte];
+ if (!unit)
+ continue;
+ if (unit <= PM_STS)
+ ttm = (unitmap[unit] >> 2) & 1;
+ else if (unit == PM_LSU0)
+ ttm = 2;
+ else {
+ ttm = 3;
+ if (unit == PM_LSU1L && byte >= 2)
+ mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+ }
+ mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+ }
+
+ /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+ memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
+ for (i = 0; i < n_ev; ++i) {
+ pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+ unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+ byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+ psel = event[i] & PM_PMCSEL_MSK;
+ if (!pmc) {
+ /* Bus event or any-PMC direct event */
+ if (unit)
+ psel |= 0x10 | ((byte & 2) << 2);
+ else
+ psel |= 8;
+ for (pmc = 0; pmc < 8; ++pmc) {
+ if (pmc_inuse & (1 << pmc))
+ continue;
+ grp = (pmc >> 1) & 1;
+ if (unit) {
+ if (grp == (byte & 1))
+ break;
+ } else if (pmc_grp_use[grp] < 4) {
+ ++pmc_grp_use[grp];
+ break;
+ }
+ }
+ pmc_inuse |= 1 << pmc;
+ } else {
+ /* Direct event */
+ --pmc;
+ if (psel == 0 && (byte & 2))
+ /* add events on higher-numbered bus */
+ mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+ }
+ pmcsel[pmc] = psel;
+ hwc[i] = pmc;
+ spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+ mmcr1 |= spcsel;
+ if (p970_marked_instr_event(event[i]))
+ mmcra |= MMCRA_SAMPLE_ENABLE;
+ }
+ for (pmc = 0; pmc < 2; ++pmc)
+ mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
+ for (; pmc < 8; ++pmc)
+ mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+ if (pmc_inuse & 1)
+ mmcr0 |= MMCR0_PMC1CE;
+ if (pmc_inuse & 0xfe)
+ mmcr0 |= MMCR0_PMCjCE;
+
+ mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
+
+ /* Return MMCRx values */
+ mmcr[0] = mmcr0;
+ mmcr[1] = mmcr1;
+ mmcr[2] = mmcra;
+ return 0;
+}
+
+static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+ int shift, i;
+
+ if (pmc <= 1) {
+ shift = MMCR0_PMC1SEL_SH - 7 * pmc;
+ i = 0;
+ } else {
+ shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
+ i = 1;
+ }
+ /*
+ * Setting the PMCxSEL field to 0x08 disables PMC x.
+ */
+ mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
+}
+
+static int ppc970_generic_events[] = {
+ [PERF_COUNT_HW_CPU_CYCLES] = 7,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 1,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
+};
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+ [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x8810, 0x3810 },
+ [C(OP_WRITE)] = { 0x7810, 0x813 },
+ [C(OP_PREFETCH)] = { 0x731, 0 },
+ },
+ [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { 0, 0 },
+ },
+ [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0 },
+ [C(OP_WRITE)] = { 0, 0 },
+ [C(OP_PREFETCH)] = { 0x733, 0 },
+ },
+ [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x704 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(ITLB)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0, 0x700 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+ [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */
+ [C(OP_READ)] = { 0x431, 0x327 },
+ [C(OP_WRITE)] = { -1, -1 },
+ [C(OP_PREFETCH)] = { -1, -1 },
+ },
+};
+
+struct power_pmu ppc970_pmu = {
+ .n_counter = 8,
+ .max_alternatives = 2,
+ .add_fields = 0x001100005555ull,
+ .test_adder = 0x013300000000ull,
+ .compute_mmcr = p970_compute_mmcr,
+ .get_constraint = p970_get_constraint,
+ .get_alternatives = p970_get_alternatives,
+ .disable_pmc = p970_disable_pmc,
+ .n_generic = ARRAY_SIZE(ppc970_generic_events),
+ .generic_events = ppc970_generic_events,
+ .cache_events = &ppc970_cache_events,
+};
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac..5beffc8f481 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+#include <linux/perf_counter.h>
#include <asm/firmware.h>
#include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
die("Weird page fault", regs, SIGSEGV);
}
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -309,6 +312,8 @@ good_area:
}
if (ret & VM_FAULT_MAJOR) {
current->maj_flt++;
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ regs, address);
#ifdef CONFIG_PPC_SMLPAR
if (firmware_has_feature(FW_FEATURE_CMO)) {
preempt_disable();
@@ -316,8 +321,11 @@ good_area:
preempt_enable();
}
#endif
- } else
+ } else {
current->min_flt++;
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ regs, address);
+ }
up_read(&mm->mmap_sem);
return 0;
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9da795e4933..732ee93a8e9 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
config PPC64
bool "64-bit kernel"
default n
+ select HAVE_PERF_COUNTERS
help
This option selects whether a 32-bit or a 64-bit kernel
will be built.
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 80b513449f4..be3581a8c29 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq)
lpar_xirr_info_set((0xff << 24) | irq);
}
-static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
+static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
{
unsigned int irq;
int status;
@@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
irq = (unsigned int)irq_map[virq].hwirq;
if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
- return;
+ return -1;
status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
if (status) {
printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
__func__, irq, status);
- return;
+ return -1;
}
/*
@@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
printk(KERN_WARNING
"%s: No online cpus in the mask %s for irq %d\n",
__func__, cpulist, virq);
- return;
+ return -1;
}
status = rtas_call(ibm_set_xive, 3, 1, NULL,
@@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
if (status) {
printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n",
__func__, irq, status);
- return;
+ return -1;
}
+
+ return 0;
}
static struct irq_chip xics_pic_direct = {
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 0efc12d1a3d..352d8c3ef52 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq)
#endif /* CONFIG_SMP */
-void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
struct mpic *mpic = mpic_from_irq(irq);
unsigned int src = mpic_irq_to_hw(irq);
@@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
mpic_physmask(cpus_addr(tmp)[0]));
}
+
+ return 0;
}
static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type)
diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h
index 3cef2af10f4..eff433c322a 100644
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
+extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
#endif /* _POWERPC_SYSDEV_MPIC_H */
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 639ac805448..65865726b28 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -102,8 +102,8 @@ struct thread_info {
#define TI_KERN_CNTD1 0x00000488
#define TI_PCR 0x00000490
#define TI_RESTART_BLOCK 0x00000498
-#define TI_KUNA_REGS 0x000004c0
-#define TI_KUNA_INSN 0x000004c8
+#define TI_KUNA_REGS 0x000004c8
+#define TI_KUNA_INSN 0x000004d0
#define TI_FPREGS 0x00000500
/* We embed this in the uppermost byte of thread_info->flags */
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 5deabe921a4..e5e78f9cfc9 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq)
}
}
-static void sun4u_set_affinity(unsigned int virt_irq,
+static int sun4u_set_affinity(unsigned int virt_irq,
const struct cpumask *mask)
{
sun4u_irq_enable(virt_irq);
+
+ return 0;
}
/* Don't do anything. The desc->status check for IRQ_DISABLED in
@@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq)
ino, err);
}
-static void sun4v_set_affinity(unsigned int virt_irq,
+static int sun4v_set_affinity(unsigned int virt_irq,
const struct cpumask *mask)
{
unsigned int ino = virt_irq_table[virt_irq].dev_ino;
@@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq,
if (err != HV_EOK)
printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
"err(%d)\n", ino, cpuid, err);
+
+ return 0;
}
static void sun4v_irq_disable(unsigned int virt_irq)
@@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq)
dev_handle, dev_ino, err);
}
-static void sun4v_virt_set_affinity(unsigned int virt_irq,
+static int sun4v_virt_set_affinity(unsigned int virt_irq,
const struct cpumask *mask)
{
unsigned long cpuid, dev_handle, dev_ino;
@@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq,
printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
"err(%d)\n",
dev_handle, dev_ino, cpuid, err);
+
+ return 0;
}
static void sun4v_virq_disable(unsigned int virt_irq)
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 00000000000..ad8ec356fb3
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# lguest paravirtualization support
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
+
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+obj-y += vdso/
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a6efe0a2e9a..68f5578fe38 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,11 @@ config X86
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+config OUTPUT_FORMAT
+ string
+ default "elf32-i386" if X86_32
+ default "elf64-x86-64" if X86_64
+
config ARCH_DEFCONFIG
string
default "arch/x86/configs/i386_defconfig" if X86_32
@@ -274,15 +279,9 @@ config SPARSE_IRQ
If you don't know what to do here, say N.
-config NUMA_MIGRATE_IRQ_DESC
- bool "Move irq desc when changing irq smp_affinity"
+config NUMA_IRQ_DESC
+ def_bool y
depends on SPARSE_IRQ && NUMA
- depends on BROKEN
- default n
- ---help---
- This enables moving irq_desc to cpu/node that irq will use handled.
-
- If you don't know what to do here, say N.
config X86_MPPARSE
bool "Enable MPS table" if ACPI
@@ -355,7 +354,7 @@ config X86_UV
depends on X86_64
depends on X86_EXTENDED_PLATFORM
depends on NUMA
- select X86_X2APIC
+ depends on X86_X2APIC
---help---
This option is needed in order to support SGI Ultraviolet systems.
If you don't have one of these, you should say N here.
@@ -740,6 +739,7 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+ select HAVE_PERF_COUNTERS if (!M386 && !M486)
config X86_IO_APIC
def_bool y
@@ -1466,9 +1466,7 @@ config KEXEC_JUMP
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
- default "0x1000000" if X86_NUMAQ
- default "0x200000" if X86_64
- default "0x100000"
+ default "0x1000000"
---help---
This gives the physical address where the kernel is loaded.
@@ -1487,15 +1485,15 @@ config PHYSICAL_START
to be specifically compiled to run from a specific memory area
(normally a reserved region) and this option comes handy.
- So if you are using bzImage for capturing the crash dump, leave
- the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
- Otherwise if you plan to use vmlinux for capturing the crash dump
- change this value to start of the reserved region (Typically 16MB
- 0x1000000). In other words, it can be set based on the "X" value as
- specified in the "crashkernel=YM@XM" command line boot parameter
- passed to the panic-ed kernel. Typically this parameter is set as
- crashkernel=64M@16M. Please take a look at
- Documentation/kdump/kdump.txt for more details about crash dumps.
+ So if you are using bzImage for capturing the crash dump,
+ leave the value here unchanged to 0x1000000 and set
+ CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux
+ for capturing the crash dump change this value to start of
+ the reserved region. In other words, it can be set based on
+ the "X" value as specified in the "crashkernel=YM@XM"
+ command line boot parameter passed to the panic-ed
+ kernel. Please take a look at Documentation/kdump/kdump.txt
+ for more details about crash dumps.
Usage of bzImage for capturing the crash dump is recommended as
one does not have to build two kernels. Same kernel can be used
@@ -1508,8 +1506,8 @@ config PHYSICAL_START
Don't change this unless you know what you are doing.
config RELOCATABLE
- bool "Build a relocatable kernel (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "Build a relocatable kernel"
+ default y
---help---
This builds a kernel image that retains relocation information
so it can be loaded someplace besides the default 1MB.
@@ -1524,12 +1522,16 @@ config RELOCATABLE
it has been loaded at and the compile time physical address
(CONFIG_PHYSICAL_START) is ignored.
+# Relocation on x86-32 needs some additional build support
+config X86_NEED_RELOCS
+ def_bool y
+ depends on X86_32 && RELOCATABLE
+
config PHYSICAL_ALIGN
hex
prompt "Alignment value to which kernel should be aligned" if X86_32
- default "0x100000" if X86_32
- default "0x200000" if X86_64
- range 0x2000 0x400000
+ default "0x1000000"
+ range 0x2000 0x1000000
---help---
This value puts the alignment restrictions on physical address
where kernel is loaded and run from. Kernel is compiled for an
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8359e73317..d105f29bb6b 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,14 +159,30 @@ config IOMMU_DEBUG
options. See Documentation/x86_64/boot-options.txt for more
details.
+config IOMMU_STRESS
+ bool "Enable IOMMU stress-test mode"
+ ---help---
+ This option disables various optimizations in IOMMU related
+ code to do real stress testing of the IOMMU code. This option
+ will cause a performance drop and should only be enabled for
+ testing.
+
config IOMMU_LEAK
bool "IOMMU leak tracing"
- depends on DEBUG_KERNEL
- depends on IOMMU_DEBUG
+ depends on IOMMU_DEBUG && DMA_API_DEBUG
---help---
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
+config X86_DS_SELFTEST
+ bool "DS selftest"
+ default y
+ depends on DEBUG_KERNEL
+ depends on X86_DS
+ ---help---
+ Perform Debug Store selftests at boot time.
+ If in doubt, say "N".
+
config HAVE_MMIOTRACE_SUPPORT
def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 8c86b72afdc..edbd0ca6206 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
-core-$(CONFIG_KVM) += arch/x86/kvm/
-
# BITS is used as extension for files which are available in a 32 bit
# and a 64 bit version to simplify shared Makefiles.
# e.g.: obj-y += foo_$(BITS).o
@@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o
libs-y += arch/x86/lib/
-# Sub architecture files that needs linking first
-core-y += $(fcore-y)
-
-# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/x86/xen/
-
-# lguest paravirtualization support
-core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
-
-core-y += arch/x86/kernel/
-core-y += arch/x86/mm/
-
-core-y += arch/x86/crypto/
-core-y += arch/x86/vdso/
-core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
+# See arch/x86/Kbuild for content of core part of the kernel
+core-y += arch/x86/
# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a98bd..851fe936d24 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
cpustr.h
mkcpustr
offsets.h
+voffset.h
+zoffset.h
setup
setup.bin
setup.elf
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505..8d16ada2504 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage
targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
-setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y += printf.o string.o tty.o video.o video-mode.o version.o
+setup-y += printf.o regs.o string.o tty.o video.o video-mode.o
+setup-y += version.o
setup-$(CONFIG_X86_APM_BOOT) += apm.o
# The link order of the video-*.o modules can matter. In particular,
@@ -86,19 +87,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-offsets := -e 's/^00*/0/' \
- -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
-quiet_cmd_offsets = OFFSETS $@
- cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@
+quiet_cmd_voffset = VOFFSET $@
+ cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
-$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE
- $(call if_changed,offsets)
+targets += voffset.h
+$(obj)/voffset.h: vmlinux FORCE
+ $(call if_changed,voffset)
+
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+
+quiet_cmd_zoffset = ZOFFSET $@
+ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
+
+targets += zoffset.h
+$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,zoffset)
-targets += offsets.h
AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/offsets.h
+$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
LDFLAGS_setup.elf := -T
$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8c244..64a31a6d751 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007-2008 rPath, Inc. - All Rights Reserved
- * Copyright 2009 Intel Corporation
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
static void enable_a20_bios(void)
{
- asm volatile("pushfl; int $0x15; popfl"
- : : "a" ((u16)0x2401));
+ struct biosregs ireg;
+
+ initregs(&ireg);
+ ireg.ax = 0x2401;
+ intcall(0x15, &ireg, NULL);
}
static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033001f..ee274834ea8 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* Original APM BIOS checking by Stephen Rothwell, May 1994
* (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
int query_apm_bios(void)
{
- u16 ax, bx, cx, dx, di;
- u32 ebx, esi;
- u8 err;
+ struct biosregs ireg, oreg;
/* APM BIOS installation check */
- ax = 0x5300;
- bx = cx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
- : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
- : : "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x53;
+ intcall(0x15, &ireg, &oreg);
- if (err)
+ if (oreg.flags & X86_EFLAGS_CF)
return -1; /* No APM BIOS */
- if (bx != 0x504d) /* "PM" signature */
+ if (oreg.bx != 0x504d) /* "PM" signature */
return -1;
- if (!(cx & 0x02)) /* 32 bits supported? */
+ if (!(oreg.cx & 0x02)) /* 32 bits supported? */
return -1;
/* Disconnect first, just in case */
- ax = 0x5304;
- bx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
-
- /* Paranoia */
- ebx = esi = 0;
- cx = dx = di = 0;
+ ireg.al = 0x04;
+ intcall(0x15, &ireg, NULL);
/* 32-bit connect */
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
- : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
- "+S" (esi), "+D" (di), "=m" (err)
- : "a" (0x5303));
-
- boot_params.apm_bios_info.cseg = ax;
- boot_params.apm_bios_info.offset = ebx;
- boot_params.apm_bios_info.cseg_16 = cx;
- boot_params.apm_bios_info.dseg = dx;
- boot_params.apm_bios_info.cseg_len = (u16)esi;
- boot_params.apm_bios_info.cseg_16_len = esi >> 16;
- boot_params.apm_bios_info.dseg_len = di;
-
- if (err)
+ ireg.al = 0x03;
+ intcall(0x15, &ireg, &oreg);
+
+ boot_params.apm_bios_info.cseg = oreg.ax;
+ boot_params.apm_bios_info.offset = oreg.ebx;
+ boot_params.apm_bios_info.cseg_16 = oreg.cx;
+ boot_params.apm_bios_info.dseg = oreg.dx;
+ boot_params.apm_bios_info.cseg_len = oreg.si;
+ boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
+ boot_params.apm_bios_info.dseg_len = oreg.di;
+
+ if (oreg.flags & X86_EFLAGS_CF)
return -1;
/* Redo the installation check as the 32-bit connect;
some BIOSes return different flags this way... */
- ax = 0x5300;
- bx = cx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
- : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
- : : "esi", "edi");
+ ireg.al = 0x00;
+ intcall(0x15, &ireg, &oreg);
- if (err || bx != 0x504d) {
+ if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
/* Failure with 32-bit connect, try to disconect and ignore */
- ax = 0x5304;
- bx = 0;
- asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
+ ireg.al = 0x04;
+ intcall(0x15, &ireg, NULL);
return -1;
}
- boot_params.apm_bios_info.version = ax;
- boot_params.apm_bios_info.flags = cx;
+ boot_params.apm_bios_info.version = oreg.ax;
+ boot_params.apm_bios_info.flags = oreg.cx;
return 0;
}
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 00000000000..507793739ea
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes
+ * touching registers they shouldn't be.
+ */
+
+ .code16
+ .text
+ .globl intcall
+ .type intcall, @function
+intcall:
+ /* Self-modify the INT instruction. Ugly, but works. */
+ cmpb %al, 3f
+ je 1f
+ movb %al, 3f
+ jmp 1f /* Synchronize pipeline */
+1:
+ /* Save state */
+ pushfl
+ pushw %fs
+ pushw %gs
+ pushal
+
+ /* Copy input state to stack frame */
+ subw $44, %sp
+ movw %dx, %si
+ movw %sp, %di
+ movw $11, %cx
+ rep; movsd
+
+ /* Pop full state from the stack */
+ popal
+ popw %gs
+ popw %fs
+ popw %es
+ popw %ds
+ popfl
+
+ /* Actual INT */
+ .byte 0xcd /* INT opcode */
+3: .byte 0
+
+ /* Push full state to the stack */
+ pushfl
+ pushw %ds
+ pushw %es
+ pushw %fs
+ pushw %gs
+ pushal
+
+ /* Re-establish C environment invariants */
+ cld
+ movzwl %sp, %esp
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+
+ /* Copy output state from stack frame */
+ movw 68(%esp), %di /* Original %cx == 3rd argument */
+ andw %di, %di
+ jz 4f
+ movw %sp, %si
+ movw $11, %cx
+ rep; movsd
+4: addw $44, %sp
+
+ /* Restore state and return */
+ popal
+ popw %gs
+ popw %fs
+ popfl
+ retl
+ .size intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e897e..98239d2658f 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
#include <asm/setup.h>
#include "bitops.h"
#include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
/* Useful macros */
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
/* apm.c */
int query_apm_bios(void);
+/* bioscall.c */
+struct biosregs {
+ union {
+ struct {
+ u32 edi;
+ u32 esi;
+ u32 ebp;
+ u32 _esp;
+ u32 ebx;
+ u32 edx;
+ u32 ecx;
+ u32 eax;
+ u32 _fsgs;
+ u32 _dses;
+ u32 eflags;
+ };
+ struct {
+ u16 di, hdi;
+ u16 si, hsi;
+ u16 bp, hbp;
+ u16 _sp, _hsp;
+ u16 bx, hbx;
+ u16 dx, hdx;
+ u16 cx, hcx;
+ u16 ax, hax;
+ u16 gs, fs;
+ u16 es, ds;
+ u16 flags, hflags;
+ };
+ struct {
+ u8 dil, dih, edi2, edi3;
+ u8 sil, sih, esi2, esi3;
+ u8 bpl, bph, ebp2, ebp3;
+ u8 _spl, _sph, _esp2, _esp3;
+ u8 bl, bh, ebx2, ebx3;
+ u8 dl, dh, edx2, edx3;
+ u8 cl, ch, ecx2, ecx3;
+ u8 al, ah, eax2, eax3;
+ };
+ };
+};
+void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
+
/* cmdline.c */
int cmdline_find_option(const char *option, char *buffer, int bufsize);
int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
int vsprintf(char *buf, const char *fmt, va_list args);
int printf(const char *fmt, ...);
+/* regs.c */
+void initregs(struct biosregs *regs);
+
/* string.c */
int strcmp(const char *str1, const char *str2);
size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b04d0..4a46fab7162 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
relocs
vmlinux.bin.all
vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f857..49c8a4c37d7 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,9 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
LDFLAGS := -m elf_$(UTS_MACHINE)
LDFLAGS_vmlinux := -T
-$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+hostprogs-y := mkpiggy
+
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
$(call if_changed,ld)
@:
@@ -29,7 +31,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_32) += relocs
+hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
quiet_cmd_relocs = RELOCS $@
cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +39,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
$(call if_changed,relocs)
vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
-quiet_cmd_relocbin = BUILD $@
- cmd_relocbin = cat $(filter-out FORCE,$^) > $@
-$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
- $(call if_changed,relocbin)
-
-ifeq ($(CONFIG_X86_32),y)
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
-ifdef CONFIG_RELOCATABLE
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
- $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
- $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
- $(call if_changed,lzma)
-else
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzma)
-endif
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
-else
+suffix-$(CONFIG_KERNEL_GZIP) := gz
+suffix-$(CONFIG_KERNEL_BZIP2) := bz2
+suffix-$(CONFIG_KERNEL_LZMA) := lzma
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
- $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
- $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
- $(call if_changed,lzma)
-
-LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
-endif
+quiet_cmd_mkpiggy = MKPIGGY $@
+ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
-suffix_$(CONFIG_KERNEL_GZIP) = gz
-suffix_$(CONFIG_KERNEL_BZIP2) = bz2
-suffix_$(CONFIG_KERNEL_LZMA) = lzma
-
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
- $(call if_changed,ld)
+targets += piggy.S
+$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
+ $(call if_changed,mkpiggy)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e..75e4f001e70 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
* the page directory. [According to comments etc elsewhere on a compressed
* kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
*
- * Page 0 is deliberately kept safe, since System Management Mode code in
+ * Page 0 is deliberately kept safe, since System Management Mode code in
* laptops may need to access the BIOS data stored there. This is also
- * useful for future device drivers that either access the BIOS via VM86
+ * useful for future device drivers that either access the BIOS via VM86
* mode.
*/
/*
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
-.text
+ .text
#include <linux/linkage.h>
#include <asm/segment.h>
@@ -29,161 +29,151 @@
#include <asm/boot.h>
#include <asm/asm-offsets.h>
-.section ".text.head","ax",@progbits
+ .section ".text.head","ax",@progbits
ENTRY(startup_32)
cld
- /* test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments */
- testb $(1<<6), BP_loadflags(%esi)
- jnz 1f
+ /*
+ * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+ * us to not reload segments
+ */
+ testb $(1<<6), BP_loadflags(%esi)
+ jnz 1f
cli
- movl $(__BOOT_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- movl %eax,%fs
- movl %eax,%gs
- movl %eax,%ss
+ movl $__BOOT_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
1:
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
* at and where we were actually loaded at. This can only be done
* with a short local call on x86. Nothing else will tell us what
* address we are running at. The reserved chunk of the real-mode
* data at 0x1e4 (defined as a scratch field) are used as the stack
* for this calculation. Only 4 bytes are needed.
*/
- leal (0x1e4+4)(%esi), %esp
- call 1f
-1: popl %ebp
- subl $1b, %ebp
+ leal (BP_scratch+4)(%esi), %esp
+ call 1f
+1: popl %ebp
+ subl $1b, %ebp
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+/*
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
* contains the address where we should move the kernel image temporarily
* for safe in-place decompression.
*/
#ifdef CONFIG_RELOCATABLE
- movl %ebp, %ebx
- addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
- andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
+ movl %ebp, %ebx
+ movl BP_kernel_alignment(%esi), %eax
+ decl %eax
+ addl %eax, %ebx
+ notl %eax
+ andl %eax, %ebx
#else
- movl $LOAD_PHYSICAL_ADDR, %ebx
+ movl $LOAD_PHYSICAL_ADDR, %ebx
#endif
- /* Replace the compressed data size with the uncompressed size */
- subl input_len(%ebp), %ebx
- movl output_len(%ebp), %eax
- addl %eax, %ebx
- /* Add 8 bytes for every 32K input block */
- shrl $12, %eax
- addl %eax, %ebx
- /* Add 32K + 18 bytes of extra slack */
- addl $(32768 + 18), %ebx
- /* Align on a 4K boundary */
- addl $4095, %ebx
- andl $~4095, %ebx
-
-/* Copy the compressed kernel to the end of our buffer
+ /* Target address to relocate to for decompression */
+ addl $z_extract_offset, %ebx
+
+ /* Set up the stack */
+ leal boot_stack_end(%ebx), %esp
+
+ /* Zero EFLAGS */
+ pushl $0
+ popfl
+
+/*
+ * Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- pushl %esi
- leal _end(%ebp), %esi
- leal _end(%ebx), %edi
- movl $(_end - startup_32), %ecx
+ pushl %esi
+ leal (_bss-4)(%ebp), %esi
+ leal (_bss-4)(%ebx), %edi
+ movl $(_bss - startup_32), %ecx
+ shrl $2, %ecx
std
- rep
- movsb
+ rep movsl
cld
- popl %esi
-
-/* Compute the kernel start address.
- */
-#ifdef CONFIG_RELOCATABLE
- addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
- andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
-#else
- movl $LOAD_PHYSICAL_ADDR, %ebp
-#endif
+ popl %esi
/*
* Jump to the relocated address.
*/
- leal relocated(%ebx), %eax
- jmp *%eax
+ leal relocated(%ebx), %eax
+ jmp *%eax
ENDPROC(startup_32)
-.section ".text"
+ .text
relocated:
/*
- * Clear BSS
- */
- xorl %eax,%eax
- leal _edata(%ebx),%edi
- leal _end(%ebx), %ecx
- subl %edi,%ecx
- cld
- rep
- stosb
-
-/*
- * Setup the stack for the decompressor
+ * Clear BSS (stack is currently empty)
*/
- leal boot_stack_end(%ebx), %esp
+ xorl %eax, %eax
+ leal _bss(%ebx), %edi
+ leal _ebss(%ebx), %ecx
+ subl %edi, %ecx
+ shrl $2, %ecx
+ rep stosl
/*
* Do the decompression, and jump to the new kernel..
*/
- movl output_len(%ebx), %eax
- pushl %eax
- # push arguments for decompress_kernel:
- pushl %ebp # output address
- movl input_len(%ebx), %eax
- pushl %eax # input_len
- leal input_data(%ebx), %eax
- pushl %eax # input_data
- leal boot_heap(%ebx), %eax
- pushl %eax # heap area
- pushl %esi # real mode pointer
- call decompress_kernel
- addl $20, %esp
- popl %ecx
+ leal z_extract_offset_negative(%ebx), %ebp
+ /* push arguments for decompress_kernel: */
+ pushl %ebp /* output address */
+ pushl $z_input_len /* input_len */
+ leal input_data(%ebx), %eax
+ pushl %eax /* input_data */
+ leal boot_heap(%ebx), %eax
+ pushl %eax /* heap area */
+ pushl %esi /* real mode pointer */
+ call decompress_kernel
+ addl $20, %esp
#if CONFIG_RELOCATABLE
-/* Find the address of the relocations.
+/*
+ * Find the address of the relocations.
*/
- movl %ebp, %edi
- addl %ecx, %edi
+ leal z_output_len(%ebp), %edi
-/* Calculate the delta between where vmlinux was compiled to run
+/*
+ * Calculate the delta between where vmlinux was compiled to run
* and where it was actually loaded.
*/
- movl %ebp, %ebx
- subl $LOAD_PHYSICAL_ADDR, %ebx
- jz 2f /* Nothing to be done if loaded at compiled addr. */
+ movl %ebp, %ebx
+ subl $LOAD_PHYSICAL_ADDR, %ebx
+ jz 2f /* Nothing to be done if loaded at compiled addr. */
/*
* Process relocations.
*/
-1: subl $4, %edi
- movl 0(%edi), %ecx
- testl %ecx, %ecx
- jz 2f
- addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
- jmp 1b
+1: subl $4, %edi
+ movl (%edi), %ecx
+ testl %ecx, %ecx
+ jz 2f
+ addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
+ jmp 1b
2:
#endif
/*
* Jump to the decompressed kernel.
*/
- xorl %ebx,%ebx
- jmp *%ebp
+ xorl %ebx, %ebx
+ jmp *%ebp
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+ .bss
+ .balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a8294800..f62c284db9e 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
/*
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
-.code32
-.text
+ .code32
+ .text
#include <linux/linkage.h>
#include <asm/segment.h>
@@ -33,12 +33,14 @@
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
-.section ".text.head"
+ .section ".text.head"
.code32
ENTRY(startup_32)
cld
- /* test KEEP_SEGMENTS flag to see if the bootloader is asking
- * us to not reload segments */
+ /*
+ * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+ * us to not reload segments
+ */
testb $(1<<6), BP_loadflags(%esi)
jnz 1f
@@ -49,14 +51,15 @@ ENTRY(startup_32)
movl %eax, %ss
1:
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
* at and where we were actually loaded at. This can only be done
* with a short local call on x86. Nothing else will tell us what
* address we are running at. The reserved chunk of the real-mode
* data at 0x1e4 (defined as a scratch field) are used as the stack
* for this calculation. Only 4 bytes are needed.
*/
- leal (0x1e4+4)(%esi), %esp
+ leal (BP_scratch+4)(%esi), %esp
call 1f
1: popl %ebp
subl $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
testl %eax, %eax
jnz no_longmode
-/* Compute the delta between where we were compiled to run at
+/*
+ * Compute the delta between where we were compiled to run at
* and where the code will actually run at.
- */
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ *
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
* contains the address where we should move the kernel image temporarily
* for safe in-place decompression.
*/
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
- addl $(PMD_PAGE_SIZE -1), %ebx
- andl $PMD_PAGE_MASK, %ebx
+ movl BP_kernel_alignment(%esi), %eax
+ decl %eax
+ addl %eax, %ebx
+ notl %eax
+ andl %eax, %ebx
#else
- movl $CONFIG_PHYSICAL_START, %ebx
+ movl $LOAD_PHYSICAL_ADDR, %ebx
#endif
- /* Replace the compressed data size with the uncompressed size */
- subl input_len(%ebp), %ebx
- movl output_len(%ebp), %eax
- addl %eax, %ebx
- /* Add 8 bytes for every 32K input block */
- shrl $12, %eax
- addl %eax, %ebx
- /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
- addl $(32768 + 18 + 4095), %ebx
- andl $~4095, %ebx
+ /* Target address to relocate to for decompression */
+ addl $z_extract_offset, %ebx
/*
* Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
/*
* Build early 4G boot pagetable
*/
- /* Initialize Page tables to 0*/
+ /* Initialize Page tables to 0 */
leal pgtable(%ebx), %edi
xorl %eax, %eax
movl $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
btsl $_EFER_LME, %eax
wrmsr
- /* Setup for the jump to 64bit mode
+ /*
+ * Setup for the jump to 64bit mode
*
* When the jump is performend we will be in long mode but
* in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
#include "../../kernel/verify_cpu_64.S"
- /* Be careful here startup_64 needs to be at a predictable
+ /*
+ * Be careful here startup_64 needs to be at a predictable
* address so I can export it in an ELF header. Bootloaders
* should look at the ELF header to find this address, as
* it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
.code64
.org 0x200
ENTRY(startup_64)
- /* We come here either from startup_32 or directly from a
+ /*
+ * We come here either from startup_32 or directly from a
* 64bit bootloader. If we come here from a bootloader we depend on
* an identity mapped page table being provied that maps our
* entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
movl $0x20, %eax
ltr %ax
- /* Compute the decompressed kernel start address. It is where
+ /*
+ * Compute the decompressed kernel start address. It is where
* we were loaded at aligned to a 2M boundary. %rbp contains the
* decompressed kernel start address.
*
* If it is a relocatable kernel then decompress and run the kernel
* from load address aligned to 2MB addr, otherwise decompress and
- * run the kernel from CONFIG_PHYSICAL_START
+ * run the kernel from LOAD_PHYSICAL_ADDR
+ *
+ * We cannot rely on the calculation done in 32-bit mode, since we
+ * may have been invoked via the 64-bit entry point.
*/
/* Start with the delta to where the kernel will run at. */
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip) /* - $startup_32 */, %rbp
- addq $(PMD_PAGE_SIZE - 1), %rbp
- andq $PMD_PAGE_MASK, %rbp
- movq %rbp, %rbx
+ movl BP_kernel_alignment(%rsi), %eax
+ decl %eax
+ addq %rax, %rbp
+ notq %rax
+ andq %rax, %rbp
#else
- movq $CONFIG_PHYSICAL_START, %rbp
- movq %rbp, %rbx
+ movq $LOAD_PHYSICAL_ADDR, %rbp
#endif
- /* Replace the compressed data size with the uncompressed size */
- movl input_len(%rip), %eax
- subq %rax, %rbx
- movl output_len(%rip), %eax
- addq %rax, %rbx
- /* Add 8 bytes for every 32K input block */
- shrq $12, %rax
- addq %rax, %rbx
- /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
- addq $(32768 + 18 + 4095), %rbx
- andq $~4095, %rbx
-
-/* Copy the compressed kernel to the end of our buffer
+ /* Target address to relocate to for decompression */
+ leaq z_extract_offset(%rbp), %rbx
+
+ /* Set up the stack */
+ leaq boot_stack_end(%rbx), %rsp
+
+ /* Zero EFLAGS */
+ pushq $0
+ popfq
+
+/*
+ * Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- leaq _end_before_pgt(%rip), %r8
- leaq _end_before_pgt(%rbx), %r9
- movq $_end_before_pgt /* - $startup_32 */, %rcx
-1: subq $8, %r8
- subq $8, %r9
- movq 0(%r8), %rax
- movq %rax, 0(%r9)
- subq $8, %rcx
- jnz 1b
+ pushq %rsi
+ leaq (_bss-8)(%rip), %rsi
+ leaq (_bss-8)(%rbx), %rdi
+ movq $_bss /* - $startup_32 */, %rcx
+ shrq $3, %rcx
+ std
+ rep movsq
+ cld
+ popq %rsi
/*
* Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
leaq relocated(%rbx), %rax
jmp *%rax
-.section ".text"
+ .text
relocated:
/*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
*/
- xorq %rax, %rax
- leaq _edata(%rbx), %rdi
- leaq _end_before_pgt(%rbx), %rcx
+ xorl %eax, %eax
+ leaq _bss(%rip), %rdi
+ leaq _ebss(%rip), %rcx
subq %rdi, %rcx
- cld
- rep
- stosb
-
- /* Setup the stack */
- leaq boot_stack_end(%rip), %rsp
-
- /* zero EFLAGS after setting rsp */
- pushq $0
- popfq
+ shrq $3, %rcx
+ rep stosq
/*
* Do the decompression, and jump to the new kernel..
*/
- pushq %rsi # Save the real mode argument
- movq %rsi, %rdi # real mode address
- leaq boot_heap(%rip), %rsi # malloc area for uncompression
- leaq input_data(%rip), %rdx # input_data
- movl input_len(%rip), %eax
- movq %rax, %rcx # input_len
- movq %rbp, %r8 # output
+ pushq %rsi /* Save the real mode argument */
+ movq %rsi, %rdi /* real mode address */
+ leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
+ leaq input_data(%rip), %rdx /* input_data */
+ movl $z_input_len, %ecx /* input_len */
+ movq %rbp, %r8 /* output target address */
call decompress_kernel
popq %rsi
@@ -311,11 +308,21 @@ gdt:
.quad 0x0000000000000000 /* TS continued */
gdt_end:
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+ .bss
+ .balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+ .section ".pgtable","a",@nobits
+ .balign 4096
+pgtable:
+ .fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73684f..842b2a36174 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,21 +325,19 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
+ if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
+ error("Destination address inappropriately aligned");
#ifdef CONFIG_X86_64
- if ((unsigned long)output & (__KERNEL_ALIGN - 1))
- error("Destination address not 2M aligned");
- if ((unsigned long)output >= 0xffffffffffUL)
+ if (heap > 0x3fffffffffffUL)
error("Destination address too large");
#else
- if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
- error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
error("Destination address too large");
+#endif
#ifndef CONFIG_RELOCATABLE
- if ((u32)output != LOAD_PHYSICAL_ADDR)
+ if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
error("Wrong destination address");
#endif
-#endif
if (!quiet)
putstr("\nDecompressing Linux... ");
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 00000000000..bcbd36c4143
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright (C) 2009 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Compute the desired load offset from a compressed program; outputs
+ * a small assembly wrapper with the appropriate symbols defined.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+static uint32_t getle32(const void *p)
+{
+ const uint8_t *cp = p;
+
+ return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
+ ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
+}
+
+int main(int argc, char *argv[])
+{
+ uint32_t olen;
+ long ilen;
+ unsigned long offs;
+ FILE *f;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+ return 1;
+ }
+
+ /* Get the information for the compressed kernel image first */
+
+ f = fopen(argv[1], "r");
+ if (!f) {
+ perror(argv[1]);
+ return 1;
+ }
+
+
+ if (fseek(f, -4L, SEEK_END)) {
+ perror(argv[1]);
+ }
+ fread(&olen, sizeof olen, 1, f);
+ ilen = ftell(f);
+ olen = getle32(&olen);
+ fclose(f);
+
+ /*
+ * Now we have the input (compressed) and output (uncompressed)
+ * sizes, compute the necessary decompression offset...
+ */
+
+ offs = (olen > ilen) ? olen - ilen : 0;
+ offs += olen >> 12; /* Add 8 bytes for each 32K block */
+ offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
+ offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+
+ printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+ printf(".globl z_input_len\n");
+ printf("z_input_len = %lu\n", ilen);
+ printf(".globl z_output_len\n");
+ printf("z_output_len = %lu\n", (unsigned long)olen);
+ printf(".globl z_extract_offset\n");
+ printf("z_extract_offset = 0x%lx\n", offs);
+ /* z_extract_offset_negative allows simplification of head_32.S */
+ printf(".globl z_extract_offset_negative\n");
+ printf("z_extract_offset_negative = -0x%lx\n", offs);
+
+ printf(".globl input_data, input_data_end\n");
+ printf("input_data:\n");
+ printf(".incbin \"%s\"\n", argv[1]);
+ printf("input_data_end:\n");
+
+ return 0;
+}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S
index bef1ac891bc..cc353e1b3ff 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,17 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#undef i386
+
+#include <asm/page_types.h>
+
+#ifdef CONFIG_X86_64
OUTPUT_ARCH(i386:x86-64)
ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
SECTIONS
{
/* Be careful parts of head_64.S assume startup_32 is at
@@ -33,16 +44,22 @@ SECTIONS
*(.data.*)
_edata = . ;
}
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.bss : {
_bss = . ;
*(.bss)
*(.bss.*)
*(COMMON)
- . = ALIGN(8);
- _end_before_pgt = . ;
- . = ALIGN(4096);
- pgtable = . ;
- . = . + 4096 * 6;
+ . = ALIGN(8); /* For convenience during zeroing */
_ebss = .;
}
+#ifdef CONFIG_X86_64
+ . = ALIGN(PAGE_SIZE);
+ .pgtable : {
+ _pgtable = . ;
+ *(.pgtable)
+ _epgtable = . ;
+ }
+#endif
+ _end = .;
}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382ae5c4..00000000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
-SECTIONS
-{
- .rodata.compressed : {
- input_len = .;
- LONG(input_data_end - input_data) input_data = .;
- *(.data)
- output_len = . - 4;
- input_data_end = .;
- }
-}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c4..00000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(startup_32)
-SECTIONS
-{
- /* Be careful parts of head_32.S assume startup_32 is at
- * address 0.
- */
- . = 0;
- .text.head : {
- _head = . ;
- *(.text.head)
- _ehead = . ;
- }
- .rodata.compressed : {
- *(.rodata.compressed)
- }
- .text : {
- _text = .; /* Text */
- *(.text)
- *(.text.*)
- _etext = . ;
- }
- .rodata : {
- _rodata = . ;
- *(.rodata) /* read-only data */
- *(.rodata.*)
- _erodata = . ;
- }
- .data : {
- _data = . ;
- *(.data)
- *(.data.*)
- _edata = . ;
- }
- .bss : {
- _bss = . ;
- *(.bss)
- *(.bss.*)
- *(COMMON)
- _end = . ;
- }
-}
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3e5ca..c501a5b466f 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
*/
static int read_mbr(u8 devno, void *buf)
{
- u16 ax, bx, cx, dx;
+ struct biosregs ireg, oreg;
- ax = 0x0201; /* Legacy Read, one sector */
- cx = 0x0001; /* Sector 0-0-1 */
- dx = devno;
- bx = (size_t)buf;
- asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
- : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
- : : "esi", "edi", "memory");
+ initregs(&ireg);
+ ireg.ax = 0x0201; /* Legacy Read, one sector */
+ ireg.cx = 0x0001; /* Sector 0-0-1 */
+ ireg.dl = devno;
+ ireg.bx = (size_t)buf;
- return -(u8)ax; /* 0 or -1 */
+ intcall(0x13, &ireg, &oreg);
+
+ return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
}
static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
static int get_edd_info(u8 devno, struct edd_info *ei)
{
- u16 ax, bx, cx, dx, di;
+ struct biosregs ireg, oreg;
memset(ei, 0, sizeof *ei);
/* Check Extensions Present */
- ax = 0x4100;
- bx = EDDMAGIC1;
- dx = devno;
- asm("pushfl; stc; int $0x13; setc %%al; popfl"
- : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
- : : "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x41;
+ ireg.bx = EDDMAGIC1;
+ ireg.dl = devno;
+ intcall(0x13, &ireg, &oreg);
- if ((u8)ax)
+ if (oreg.eflags & X86_EFLAGS_CF)
return -1; /* No extended information */
- if (bx != EDDMAGIC2)
+ if (oreg.bx != EDDMAGIC2)
return -1;
ei->device = devno;
- ei->version = ax >> 8; /* EDD version number */
- ei->interface_support = cx; /* EDD functionality subsets */
+ ei->version = oreg.ah; /* EDD version number */
+ ei->interface_support = oreg.cx; /* EDD functionality subsets */
/* Extended Get Device Parameters */
ei->params.length = sizeof(ei->params);
- ax = 0x4800;
- dx = devno;
- asm("pushfl; int $0x13; popfl"
- : "+a" (ax), "+d" (dx), "=m" (ei->params)
- : "S" (&ei->params)
- : "ebx", "ecx", "edi");
+ ireg.ah = 0x48;
+ ireg.si = (size_t)&ei->params;
+ intcall(0x13, &ireg, &oreg);
/* Get legacy CHS parameters */
/* Ralf Brown recommends setting ES:DI to 0:0 */
- ax = 0x0800;
- dx = devno;
- di = 0;
- asm("pushw %%es; "
- "movw %%di,%%es; "
- "pushfl; stc; int $0x13; setc %%al; popfl; "
- "popw %%es"
- : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
- : : "esi");
-
- if ((u8)ax == 0) {
- ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
- ei->legacy_max_head = dx >> 8;
- ei->legacy_sectors_per_track = cx & 0x3f;
+ ireg.ah = 0x08;
+ ireg.es = 0;
+ intcall(0x13, &ireg, &oreg);
+
+ if (!(oreg.eflags & X86_EFLAGS_CF)) {
+ ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
+ ei->legacy_max_head = oreg.dh;
+ ei->legacy_sectors_per_track = oreg.cl & 0x3f;
}
return 0;
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4..b31cc54b464 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
#include <asm/page_types.h>
#include <asm/setup.h>
#include "boot.h"
-#include "offsets.h"
+#include "voffset.h"
+#include "zoffset.h"
BOOTSEG = 0x07C0 /* original address of boot-sector */
SYSSEG = 0x1000 /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
# Part 2 of the header, from the old setup.S
.ascii "HdrS" # header signature
- .word 0x0209 # header version number (>= 0x0105)
+ .word 0x020a # header version number (>= 0x0105)
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512
# end of setup code can be used by setup
# for local heap purposes.
-pad1: .word 0
+ext_loader_ver:
+ .byte 0 # Extended boot loader version
+ext_loader_type:
+ .byte 0 # Extended boot loader type
+
cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
# If nonzero, a 32-bit pointer
# to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel: .byte 1
#else
relocatable_kernel: .byte 0
#endif
-pad2: .byte 0
+min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment
pad3: .word 0
cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07
hardware_subarch_data: .quad 0
-payload_offset: .long input_data
-payload_length: .long input_data_end-input_data
+payload_offset: .long ZO_input_data
+payload_length: .long ZO_z_input_len
setup_data: .quad 0 # 64-bit physical pointer to
# single linked list of
# struct setup_data
+pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr
+
+#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
+#define VO_INIT_SIZE (VO__end - VO__text)
+#if ZO_INIT_SIZE > VO_INIT_SIZE
+#define INIT_SIZE ZO_INIT_SIZE
+#else
+#define INIT_SIZE VO_INIT_SIZE
+#endif
+init_size: .long INIT_SIZE # kernel initialization size
+
# End of setup header #####################################################
- .section ".inittext", "ax"
+ .section ".entrytext", "ax"
start_of_setup:
#ifdef SAFE_RESET_DISK_CONTROLLER
# Reset the disk controller.
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415d3ae..140172b895b 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
*/
static void keyboard_set_repeat(void)
{
- u16 ax = 0x0305;
- u16 bx = 0;
- asm volatile("int $0x16"
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
+ struct biosregs ireg;
+ initregs(&ireg);
+ ireg.ax = 0x0305;
+ intcall(0x16, &ireg, NULL);
}
/*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
*/
static void query_ist(void)
{
+ struct biosregs ireg, oreg;
+
/* Some older BIOSes apparently crash on this call, so filter
it from machines too old to have SpeedStep at all. */
if (cpu.level < 6)
return;
- asm("int $0x15"
- : "=a" (boot_params.ist_info.signature),
- "=b" (boot_params.ist_info.command),
- "=c" (boot_params.ist_info.event),
- "=d" (boot_params.ist_info.perf_level)
- : "a" (0x0000e980), /* IST Support */
- "d" (0x47534943)); /* Request value */
+ initregs(&ireg);
+ ireg.ax = 0xe980; /* IST Support */
+ ireg.edx = 0x47534943; /* Request value */
+ intcall(0x15, &ireg, &oreg);
+
+ boot_params.ist_info.signature = oreg.eax;
+ boot_params.ist_info.command = oreg.ebx;
+ boot_params.ist_info.event = oreg.ecx;
+ boot_params.ist_info.perf_level = oreg.edx;
}
/*
@@ -93,13 +97,12 @@ static void query_ist(void)
static void set_bios_mode(void)
{
#ifdef CONFIG_X86_64
- u32 eax, ebx;
+ struct biosregs ireg;
- eax = 0xec00;
- ebx = 2;
- asm volatile("int $0x15"
- : "+a" (eax), "+b" (ebx)
- : : "ecx", "edx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ax = 0xec00;
+ ireg.bx = 2;
+ intcall(0x15, &ireg, NULL);
#endif
}
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae5d69..a95a531148e 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
int query_mca(void)
{
- u8 err;
- u16 es, bx, len;
-
- asm("pushw %%es ; "
- "int $0x15 ; "
- "setc %0 ; "
- "movw %%es, %1 ; "
- "popw %%es"
- : "=acd" (err), "=acdSD" (es), "=b" (bx)
- : "a" (0xc000));
-
- if (err)
+ struct biosregs ireg, oreg;
+ u16 len;
+
+ initregs(&ireg);
+ ireg.ah = 0xc0;
+ intcall(0x15, &ireg, &oreg);
+
+ if (oreg.eflags & X86_EFLAGS_CF)
return -1; /* No MCA present */
- set_fs(es);
- len = rdfs16(bx);
+ set_fs(oreg.es);
+ len = rdfs16(oreg.bx);
if (len > sizeof(boot_params.sys_desc_table))
len = sizeof(boot_params.sys_desc_table);
- copy_from_fs(&boot_params.sys_desc_table, bx, len);
+ copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
return 0;
}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 74b3d2ba84e..cae3feb1035 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -20,12 +20,16 @@
static int detect_memory_e820(void)
{
int count = 0;
- u32 next = 0;
- u32 size, id, edi;
- u8 err;
+ struct biosregs ireg, oreg;
struct e820entry *desc = boot_params.e820_map;
static struct e820entry buf; /* static so it is zeroed */
+ initregs(&ireg);
+ ireg.ax = 0xe820;
+ ireg.cx = sizeof buf;
+ ireg.edx = SMAP;
+ ireg.di = (size_t)&buf;
+
/*
* Note: at least one BIOS is known which assumes that the
* buffer pointed to by one e820 call is the same one as
@@ -41,22 +45,13 @@ static int detect_memory_e820(void)
*/
do {
- size = sizeof buf;
-
- /* Important: %edx and %esi are clobbered by some BIOSes,
- so they must be either used for the error output
- or explicitly marked clobbered. Given that, assume there
- is something out there clobbering %ebp and %edi, too. */
- asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
- : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
- "=D" (edi), "+m" (buf)
- : "D" (&buf), "d" (SMAP), "a" (0xe820)
- : "esi");
+ intcall(0x15, &ireg, &oreg);
+ ireg.ebx = oreg.ebx; /* for next iteration... */
/* BIOSes which terminate the chain with CF = 1 as opposed
to %ebx = 0 don't always report the SMAP signature on
the final, failing, probe. */
- if (err)
+ if (oreg.eflags & X86_EFLAGS_CF)
break;
/* Some BIOSes stop returning SMAP in the middle of
@@ -64,60 +59,64 @@ static int detect_memory_e820(void)
screwed up the map at that point, we might have a
partial map, the full map, or complete garbage, so
just return failure. */
- if (id != SMAP) {
+ if (oreg.eax != SMAP) {
count = 0;
break;
}
*desc++ = buf;
count++;
- } while (next && count < ARRAY_SIZE(boot_params.e820_map));
+ } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
return boot_params.e820_entries = count;
}
static int detect_memory_e801(void)
{
- u16 ax, bx, cx, dx;
- u8 err;
+ struct biosregs ireg, oreg;
- bx = cx = dx = 0;
- ax = 0xe801;
- asm("stc; int $0x15; setc %0"
- : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
+ initregs(&ireg);
+ ireg.ax = 0xe801;
+ intcall(0x15, &ireg, &oreg);
- if (err)
+ if (oreg.eflags & X86_EFLAGS_CF)
return -1;
/* Do we really need to do this? */
- if (cx || dx) {
- ax = cx;
- bx = dx;
+ if (oreg.cx || oreg.dx) {
+ oreg.ax = oreg.cx;
+ oreg.bx = oreg.dx;
}
- if (ax > 15*1024)
+ if (oreg.ax > 15*1024) {
return -1; /* Bogus! */
-
- /* This ignores memory above 16MB if we have a memory hole
- there. If someone actually finds a machine with a memory
- hole at 16MB and no support for 0E820h they should probably
- generate a fake e820 map. */
- boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
+ } else if (oreg.ax == 15*1024) {
+ boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+ } else {
+ /*
+ * This ignores memory above 16MB if we have a memory
+ * hole there. If someone actually finds a machine
+ * with a memory hole at 16MB and no support for
+ * 0E820h they should probably generate a fake e820
+ * map.
+ */
+ boot_params.alt_mem_k = oreg.ax;
+ }
return 0;
}
static int detect_memory_88(void)
{
- u16 ax;
- u8 err;
+ struct biosregs ireg, oreg;
- ax = 0x8800;
- asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
+ initregs(&ireg);
+ ireg.ah = 0x88;
+ intcall(0x15, &ireg, &oreg);
- boot_params.screen_info.ext_mem_k = ax;
+ boot_params.screen_info.ext_mem_k = oreg.ax;
- return -err;
+ return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
}
int detect_memory(void)
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 00000000000..958019b1cfa
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2 or (at your
+ * option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple helper function for initializing a register set.
+ *
+ * Note that this sets EFLAGS_CF in the input register set; this
+ * makes it easier to catch functions which do nothing but don't
+ * explicitly set CF.
+ */
+
+#include "boot.h"
+
+void initregs(struct biosregs *reg)
+{
+ memset(reg, 0, sizeof *reg);
+ reg->eflags |= X86_EFLAGS_CF;
+ reg->ds = ds();
+ reg->es = ds();
+ reg->fs = fs();
+ reg->gs = gs();
+}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2de796..0f6ec455a2b 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
. = 497;
.header : { *(.header) }
+ .entrytext : { *(.entrytext) }
.inittext : { *(.inittext) }
.initdata : { *(.initdata) }
+ __end_init = .;
+
.text : { *(.text) }
.text32 : { *(.text32) }
@@ -52,4 +55,7 @@ SECTIONS
. = ASSERT(_end <= 0x8000, "Setup too big!");
. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+ /* Necessary for the very-old-loader check to work... */
+ . = ASSERT(__end_init <= 5*512, "init sections too big!");
+
}
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b25f5f..01ec69c901c 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
void __attribute__((section(".inittext"))) putchar(int ch)
{
- unsigned char c = ch;
+ struct biosregs ireg;
- if (c == '\n')
+ if (ch == '\n')
putchar('\r'); /* \n -> \r\n */
- /* int $0x10 is known to have bugs involving touching registers
- it shouldn't. Be extra conservative... */
- asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
- : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
+ initregs(&ireg);
+ ireg.bx = 0x0007;
+ ireg.cx = 0x0001;
+ ireg.ah = 0x0e;
+ ireg.al = ch;
+ intcall(0x10, &ireg, NULL);
}
void __attribute__((section(".inittext"))) puts(const char *str)
{
- int n = 0;
- while (*str) {
+ while (*str)
putchar(*str++);
- n++;
- }
}
/*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
static u8 gettime(void)
{
- u16 ax = 0x0200;
- u16 cx, dx;
+ struct biosregs ireg, oreg;
- asm volatile("int $0x1a"
- : "+a" (ax), "=c" (cx), "=d" (dx)
- : : "ebx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x02;
+ intcall(0x1a, &ireg, &oreg);
- return dx >> 8;
+ return oreg.dh;
}
/*
@@ -64,19 +63,24 @@ static u8 gettime(void)
*/
int getchar(void)
{
- u16 ax = 0;
- asm volatile("int $0x16" : "+a" (ax));
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ /* ireg.ah = 0x00; */
+ intcall(0x16, &ireg, &oreg);
- return ax & 0xff;
+ return oreg.al;
}
static int kbd_pending(void)
{
- u8 pending;
- asm volatile("int $0x16; setnz %0"
- : "=qm" (pending)
- : "a" (0x0100));
- return pending;
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+ ireg.ah = 0x01;
+ intcall(0x16, &ireg, &oreg);
+
+ return !(oreg.eflags & X86_EFLAGS_ZF);
}
void kbd_flush(void)
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c9c36..d660be49236 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
static int set_bios_mode(u8 mode)
{
- u16 ax;
+ struct biosregs ireg, oreg;
u8 new_mode;
- ax = mode; /* AH=0x00 Set Video Mode */
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+ initregs(&ireg);
+ ireg.al = mode; /* AH=0x00 Set Video Mode */
+ intcall(0x10, &ireg, NULL);
- ax = 0x0f00; /* Get Current Video Mode */
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+
+ ireg.ah = 0x0f; /* Get Current Video Mode */
+ intcall(0x10, &ireg, &oreg);
do_restore = 1; /* Assume video contents were lost */
- new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */
+
+ /* Not all BIOSes are clean with the top bit */
+ new_mode = ireg.al & 0x7f;
if (new_mode == mode)
return 0; /* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
/* Mode setting failed, but we didn't end up where we
started. That's bad. Try to revert to the original
video mode. */
- ax = boot_params.screen_info.orig_video_mode;
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+ ireg.ax = boot_params.screen_info.orig_video_mode;
+ intcall(0x10, &ireg, NULL);
}
#endif
return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8ce3f6..c700147d6ff 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
static int vesa_probe(void)
{
#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
- u16 ax, cx, di;
+ struct biosregs ireg, oreg;
u16 mode;
addr_t mode_ptr;
struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
video_vesa.modes = GET_HEAP(struct mode_info, 0);
- ax = 0x4f00;
- di = (size_t)&vginfo;
- asm(INT10
- : "+a" (ax), "+D" (di), "=m" (vginfo)
- : : "ebx", "ecx", "edx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f00;
+ ireg.di = (size_t)&vginfo;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f ||
+ if (ireg.ax != 0x004f ||
vginfo.signature != VESA_MAGIC ||
vginfo.version < 0x0102)
return 0; /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
- ax = 0x4f01;
- cx = mode;
- di = (size_t)&vminfo;
- asm(INT10
- : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
- : : "ebx", "edx", "esi");
+ ireg.ax = 0x4f01;
+ ireg.cx = mode;
+ ireg.di = (size_t)&vminfo;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (ireg.ax != 0x004f)
continue;
if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
static int vesa_set_mode(struct mode_info *mode)
{
- u16 ax, bx, cx, di;
+ struct biosregs ireg, oreg;
int is_graphic;
u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
- ax = 0x4f01;
- cx = vesa_mode;
- di = (size_t)&vminfo;
- asm(INT10
- : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
- : : "ebx", "edx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f01;
+ ireg.cx = vesa_mode;
+ ireg.di = (size_t)&vminfo;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return -1;
if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
}
- ax = 0x4f02;
- bx = vesa_mode;
- di = 0;
- asm volatile(INT10
- : "+a" (ax), "+b" (bx), "+D" (di)
- : : "ecx", "edx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f02;
+ ireg.bx = vesa_mode;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return -1;
graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
/* Switch DAC to 8-bit mode */
static void vesa_dac_set_8bits(void)
{
+ struct biosregs ireg, oreg;
u8 dac_size = 6;
/* If possible, switch the DAC to 8-bit mode */
if (vginfo.capabilities & 1) {
- u16 ax, bx;
-
- ax = 0x4f08;
- bx = 0x0800;
- asm volatile(INT10
- : "+a" (ax), "+b" (bx)
- : : "ecx", "edx", "esi", "edi");
-
- if (ax == 0x004f)
- dac_size = bx >> 8;
+ initregs(&ireg);
+ ireg.ax = 0x4f08;
+ ireg.bh = 0x08;
+ intcall(0x10, &ireg, &oreg);
+ if (oreg.ax == 0x004f)
+ dac_size = oreg.bh;
}
/* Set the color sizes to the DAC size, and offsets to 0 */
- boot_params.screen_info.red_size = dac_size;
+ boot_params.screen_info.red_size = dac_size;
boot_params.screen_info.green_size = dac_size;
- boot_params.screen_info.blue_size = dac_size;
- boot_params.screen_info.rsvd_size = dac_size;
+ boot_params.screen_info.blue_size = dac_size;
+ boot_params.screen_info.rsvd_size = dac_size;
- boot_params.screen_info.red_pos = 0;
- boot_params.screen_info.green_pos = 0;
- boot_params.screen_info.blue_pos = 0;
- boot_params.screen_info.rsvd_pos = 0;
+ boot_params.screen_info.red_pos = 0;
+ boot_params.screen_info.green_pos = 0;
+ boot_params.screen_info.blue_pos = 0;
+ boot_params.screen_info.rsvd_pos = 0;
}
/* Save the VESA protected mode info */
static void vesa_store_pm_info(void)
{
- u16 ax, bx, di, es;
+ struct biosregs ireg, oreg;
- ax = 0x4f0a;
- bx = di = 0;
- asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
- : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
- : : "ecx", "esi");
+ initregs(&ireg);
+ ireg.ax = 0x4f0a;
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return;
- boot_params.screen_info.vesapm_seg = es;
- boot_params.screen_info.vesapm_off = di;
+ boot_params.screen_info.vesapm_seg = oreg.es;
+ boot_params.screen_info.vesapm_off = oreg.di;
}
/*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
void vesa_store_edid(void)
{
#ifdef CONFIG_FIRMWARE_EDID
- u16 ax, bx, cx, dx, di;
+ struct biosregs ireg, oreg;
/* Apparently used as a nonsense token... */
memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
if (vginfo.version < 0x0200)
return; /* EDID requires VBE 2.0+ */
- ax = 0x4f15; /* VBE DDC */
- bx = 0x0000; /* Report DDC capabilities */
- cx = 0; /* Controller 0 */
- di = 0; /* ES:DI must be 0 by spec */
-
- /* Note: The VBE DDC spec is different from the main VESA spec;
- we genuinely have to assume all registers are destroyed here. */
-
- asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
- : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
- : : "esi", "edx");
+ initregs(&ireg);
+ ireg.ax = 0x4f15; /* VBE DDC */
+ /* ireg.bx = 0x0000; */ /* Report DDC capabilities */
+ /* ireg.cx = 0; */ /* Controller 0 */
+ ireg.es = 0; /* ES:DI must be 0 by spec */
+ intcall(0x10, &ireg, &oreg);
- if (ax != 0x004f)
+ if (oreg.ax != 0x004f)
return; /* No EDID */
/* BH = time in seconds to transfer EDD information */
/* BL = DDC level supported */
- ax = 0x4f15; /* VBE DDC */
- bx = 0x0001; /* Read EDID */
- cx = 0; /* Controller 0 */
- dx = 0; /* EDID block number */
- di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
- asm(INT10
- : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info),
- "+c" (cx), "+D" (di)
- : : "esi");
+ ireg.ax = 0x4f15; /* VBE DDC */
+ ireg.bx = 0x0001; /* Read EDID */
+ /* ireg.cx = 0; */ /* Controller 0 */
+ /* ireg.dx = 0; */ /* EDID block number */
+ ireg.es = ds();
+ ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
+ intcall(0x10, &ireg, &oreg);
#endif /* CONFIG_FIRMWARE_EDID */
}
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a3776..8f8d827e254 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
/* Set basic 80x25 mode */
static u8 vga_set_basic_mode(void)
{
+ struct biosregs ireg, oreg;
u16 ax;
u8 rows;
u8 mode;
+ initregs(&ireg);
+
#ifdef CONFIG_VIDEO_400_HACK
if (adapter >= ADAPTER_VGA) {
- asm volatile(INT10
- : : "a" (0x1202), "b" (0x0030)
- : "ecx", "edx", "esi", "edi");
+ ireg.ax = 0x1202;
+ ireg.bx = 0x0030;
+ intcall(0x10, &ireg, NULL);
}
#endif
ax = 0x0f00;
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
-
- mode = (u8)ax;
+ intcall(0x10, &ireg, &oreg);
+ mode = oreg.al;
set_fs(0);
rows = rdfs8(0x484); /* rows minus one */
#ifndef CONFIG_VIDEO_400_HACK
- if ((ax == 0x5003 || ax == 0x5007) &&
+ if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
(rows == 0 || rows == 24))
return mode;
#endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
mode = 3;
/* Set the mode */
- ax = mode;
- asm volatile(INT10
- : "+a" (ax)
- : : "ebx", "ecx", "edx", "esi", "edi");
+ ireg.ax = mode; /* AH=0: set mode */
+ intcall(0x10, &ireg, NULL);
do_restore = 1;
return mode;
}
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
static void vga_set_8font(void)
{
/* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+ struct biosregs ireg;
+
+ initregs(&ireg);
/* Set 8x8 font */
- asm volatile(INT10 : : "a" (0x1112), "b" (0));
+ ireg.ax = 0x1112;
+ /* ireg.bl = 0; */
+ intcall(0x10, &ireg, NULL);
/* Use alternate print screen */
- asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
+ ireg.ax = 0x1200;
+ ireg.bl = 0x20;
+ intcall(0x10, &ireg, NULL);
/* Turn off cursor emulation */
- asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+ ireg.ax = 0x1201;
+ ireg.bl = 0x34;
+ intcall(0x10, &ireg, NULL);
/* Cursor is scan lines 6-7 */
- asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
+ ireg.ax = 0x0100;
+ ireg.cx = 0x0607;
+ intcall(0x10, &ireg, NULL);
}
static void vga_set_14font(void)
{
/* Set 9x14 font - 80x28 on VGA */
+ struct biosregs ireg;
+
+ initregs(&ireg);
/* Set 9x14 font */
- asm volatile(INT10 : : "a" (0x1111), "b" (0));
+ ireg.ax = 0x1111;
+ /* ireg.bl = 0; */
+ intcall(0x10, &ireg, NULL);
/* Turn off cursor emulation */
- asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+ ireg.ax = 0x1201;
+ ireg.bl = 0x34;
+ intcall(0x10, &ireg, NULL);
/* Cursor is scan lines 11-12 */
- asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
+ ireg.ax = 0x0100;
+ ireg.cx = 0x0b0c;
+ intcall(0x10, &ireg, NULL);
}
static void vga_set_80x43(void)
{
/* Set 80x43 mode on VGA (not EGA) */
+ struct biosregs ireg;
+
+ initregs(&ireg);
/* Set 350 scans */
- asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
+ ireg.ax = 0x1201;
+ ireg.bl = 0x30;
+ intcall(0x10, &ireg, NULL);
/* Reset video mode */
- asm volatile(INT10 : : "a" (0x0003));
+ ireg.ax = 0x0003;
+ intcall(0x10, &ireg, NULL);
vga_set_8font();
}
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
*/
static int vga_probe(void)
{
- u16 ega_bx;
-
static const char *card_name[] = {
"CGA/MDA/HGC", "EGA", "VGA"
};
@@ -240,26 +263,26 @@ static int vga_probe(void)
sizeof(ega_modes)/sizeof(struct mode_info),
sizeof(vga_modes)/sizeof(struct mode_info),
};
- u8 vga_flag;
- asm(INT10
- : "=b" (ega_bx)
- : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
- : "ecx", "edx", "esi", "edi");
+ struct biosregs ireg, oreg;
+
+ initregs(&ireg);
+
+ ireg.ax = 0x1200;
+ ireg.bl = 0x10; /* Check EGA/VGA */
+ intcall(0x10, &ireg, &oreg);
#ifndef _WAKEUP
- boot_params.screen_info.orig_video_ega_bx = ega_bx;
+ boot_params.screen_info.orig_video_ega_bx = oreg.bx;
#endif
/* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
- if ((u8)ega_bx != 0x10) {
+ if (oreg.bl != 0x10) {
/* EGA/VGA */
- asm(INT10
- : "=a" (vga_flag)
- : "a" (0x1a00)
- : "ebx", "ecx", "edx", "esi", "edi");
+ ireg.ax = 0x1a00;
+ intcall(0x10, &ireg, &oreg);
- if (vga_flag == 0x1a) {
+ if (oreg.al == 0x1a) {
adapter = ADAPTER_VGA;
#ifndef _WAKEUP
boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1febe..bad728b76fc 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
static void store_cursor_position(void)
{
- u16 curpos;
- u16 ax, bx;
+ struct biosregs ireg, oreg;
- ax = 0x0300;
- bx = 0;
- asm(INT10
- : "=d" (curpos), "+a" (ax), "+b" (bx)
- : : "ecx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x03;
+ intcall(0x10, &ireg, &oreg);
- boot_params.screen_info.orig_x = curpos;
- boot_params.screen_info.orig_y = curpos >> 8;
+ boot_params.screen_info.orig_x = oreg.dl;
+ boot_params.screen_info.orig_y = oreg.dh;
}
static void store_video_mode(void)
{
- u16 ax, page;
+ struct biosregs ireg, oreg;
/* N.B.: the saving of the video page here is a bit silly,
since we pretty much assume page 0 everywhere. */
- ax = 0x0f00;
- asm(INT10
- : "+a" (ax), "=b" (page)
- : : "ecx", "edx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x0f;
+ intcall(0x10, &ireg, &oreg);
/* Not all BIOSes are clean with respect to the top bit */
- boot_params.screen_info.orig_video_mode = ax & 0x7f;
- boot_params.screen_info.orig_video_page = page >> 8;
+ boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
+ boot_params.screen_info.orig_video_page = oreg.bh;
}
/*
@@ -257,7 +254,7 @@ static void restore_screen(void)
int y;
addr_t dst = 0;
u16 *src = saved.data;
- u16 ax, bx, dx;
+ struct biosregs ireg;
if (graphic_mode)
return; /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
}
/* Restore cursor position */
- ax = 0x0200; /* Set cursor position */
- bx = 0; /* Page number (<< 8) */
- dx = (saved.cury << 8)+saved.curx;
- asm volatile(INT10
- : "+a" (ax), "+b" (bx), "+d" (dx)
- : : "ecx", "esi", "edi");
+ initregs(&ireg);
+ ireg.ah = 0x02; /* Set cursor position */
+ ireg.dh = saved.cury;
+ ireg.dl = saved.curx;
+ intcall(0x10, &ireg, NULL);
}
#else
#define save_screen() ((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d1446..5bb174a997f 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */
extern int do_restore; /* Restore screen contents */
extern int graphic_mode; /* Graphics mode with linear frame buffer */
-/*
- * int $0x10 is notorious for touching registers it shouldn't.
- * gcc doesn't like %ebp being clobbered, so define it as a push/pop
- * sequence here.
- *
- * A number of systems, including the original PC can clobber %bp in
- * certain circumstances, like when scrolling. There exists at least
- * one Trident video card which could clobber DS under a set of
- * circumstances that we are unlikely to encounter (scrolling when
- * using an extended graphics mode of more than 800x600 pixels), but
- * it's cheap insurance to deal with that here.
- */
-#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
-
/* Accessing VGA indexed registers */
static inline u8 in_idx(u16 port, u8 index)
{
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d0f6f..edb992ebef9 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:50:58 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:21:55 2009
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
# CONFIG_X86_64 is not set
CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf32-i386"
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_DEFAULT_IDLE=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
# CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
# CONFIG_AUDIT_ARCH is not set
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_X86_32_SMP=y
CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
CONFIG_X86_TRAMPOLINE=y
+CONFIG_X86_32_LAZY_GS=y
CONFIG_KTIME_SCALAR=y
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_PCI_QUIRKS=y
CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
# CONFIG_SLAB is not set
CONFIG_SLUB=y
# CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
CONFIG_HAVE_GENERIC_DMA_COHERENT=y
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
# CONFIG_LBD is not set
-CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
CONFIG_SPARSE_IRQ=y
-CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
+# CONFIG_X86_BIGSMP is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_VSMP is not set
# CONFIG_X86_RDC321X is not set
+# CONFIG_X86_32_NON_STANDARD is not set
CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
# CONFIG_GENERIC_CPU is not set
CONFIG_X86_GENERIC=y
CONFIG_X86_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=5
CONFIG_X86_XADD=y
# CONFIG_X86_PPRO_FENCE is not set
CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
CONFIG_CPU_SUP_INTEL=y
CONFIG_CPU_SUP_CYRIX_32=y
CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_32=y
+CONFIG_CPU_SUP_CENTAUR=y
CONFIG_CPU_SUP_TRANSMETA_32=y
CONFIG_CPU_SUP_UMC_32=y
CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
# CONFIG_NOHIGHMEM is not set
CONFIG_HIGHMEM4G=y
# CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
CONFIG_HIGHPTE=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
CONFIG_X86_PAT=y
CONFIG_EFI=y
CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
# CONFIG_KEXEC_JUMP is not set
CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
# CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
# CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
CONFIG_PCI_DIRECT=y
CONFIG_PCI_MMCONFIG=y
CONFIG_PCI_DOMAINS=y
+# CONFIG_DMAR is not set
CONFIG_PCIEPORTBUS=y
# CONFIG_HOTPLUG_PCI_PCIE is not set
CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
CONFIG_ISA_DMA_API=y
# CONFIG_ISA is not set
# CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
#
# Networking options
#
-CONFIG_COMPAT_NET_DEV_OPS=y
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
#
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
#
# CONFIG_NET_PKTGEN is not set
# CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
CONFIG_HAMRADIO=y
#
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
CONFIG_FIB_RULES=y
CONFIG_WIRELESS=y
CONFIG_CFG80211=y
# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
CONFIG_WIRELESS_OLD_REGULATORY=y
CONFIG_WIRELESS_EXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
# CONFIG_ICS932S401 is not set
# CONFIG_ENCLOSURE_SERVICES is not set
# CONFIG_HP_ILO is not set
+# CONFIG_ISL29003 is not set
# CONFIG_C2PORT is not set
#
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
# CONFIG_SCSI_LOWLEVEL is not set
# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
# CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=y
# CONFIG_ATA_NONSTANDARD is not set
CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
# CONFIG_IFB is not set
# CONFIG_DUMMY is not set
# CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
CONFIG_NET_VENDOR_3COM=y
# CONFIG_VORTEX is not set
# CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
# CONFIG_DE2104X is not set
# CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
CONFIG_E1000E=y
# CONFIG_IP1000 is not set
# CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
# CONFIG_NS83820 is not set
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
# CONFIG_JME is not set
CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_IXGBE is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_BNX2X is not set
# CONFIG_QLGE is not set
# CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
CONFIG_TR=y
# CONFIG_IBMOL is not set
# CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
# CONFIG_LIBERTAS is not set
# CONFIG_LIBERTAS_THINFIRM is not set
# CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
# CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
# CONFIG_AIRO_CS is not set
# CONFIG_PCMCIA_WL3501 is not set
# CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
# CONFIG_RTL8187 is not set
# CONFIG_ADM8211 is not set
# CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
# CONFIG_P54_COMMON is not set
CONFIG_ATH5K=y
# CONFIG_ATH5K_DEBUG is not set
# CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
# CONFIG_IPW2100 is not set
# CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
# CONFIG_HOSTAP is not set
# CONFIG_B43 is not set
# CONFIG_B43LEGACY is not set
# CONFIG_ZD1211RW is not set
# CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
#
# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
# CONFIG_TABLET_USB_KBTAB is not set
# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
# CONFIG_TOUCHSCREEN_FUJITSU is not set
# CONFIG_TOUCHSCREEN_GUNZE is not set
# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
# CONFIG_LEGACY_PTYS is not set
# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
CONFIG_HW_RANDOM_INTEL=y
CONFIG_HW_RANDOM_AMD=y
CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_MAX6875 is not set
# CONFIG_SENSORS_TSL2550 is not set
# CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_ADT7475 is not set
# CONFIG_SENSORS_K8TEMP is not set
# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
# CONFIG_SENSORS_ATXP1 is not set
# CONFIG_SENSORS_DS1621 is not set
# CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_FSCHER is not set
# CONFIG_SENSORS_FSCPOS is not set
# CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_LM90 is not set
# CONFIG_SENSORS_LM92 is not set
# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
# CONFIG_SENSORS_MAX1619 is not set
# CONFIG_SENSORS_MAX6650 is not set
# CONFIG_SENSORS_PC87360 is not set
# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_SIS5595 is not set
# CONFIG_SENSORS_DME1737 is not set
# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
# CONFIG_FB_3DFX is not set
# CONFIG_FB_VOODOO1 is not set
# CONFIG_FB_VT8623 is not set
-# CONFIG_FB_CYBLA is not set
# CONFIG_FB_TRIDENT is not set
# CONFIG_FB_ARK is not set
# CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
# CONFIG_SND_INDIGO is not set
# CONFIG_SND_INDIGOIO is not set
# CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
# CONFIG_SND_EMU10K1 is not set
# CONFIG_SND_EMU10K1X is not set
# CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
#
# Special HID drivers
#
-CONFIG_HID_COMPAT=y
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
CONFIG_HID_CHERRY=y
CONFIG_HID_CHICONY=y
CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
# CONFIG_USB_TMC is not set
#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
#
#
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
# CONFIG_USB_IDMOUSE is not set
# CONFIG_USB_FTDI_ELAN is not set
# CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
#
# OTG and related infrastructure
#
+# CONFIG_NOP_USB_XCEIV is not set
# CONFIG_UWB is not set
# CONFIG_MMC is not set
# CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
#
# CONFIG_LEDS_ALIX2 is not set
# CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
# CONFIG_LEDS_CLEVO_MAIL is not set
# CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
#
# LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
# CONFIG_ACCESSIBILITY is not set
# CONFIG_INFINIBAND is not set
CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
# DMA Devices
#
# CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
# CONFIG_UIO is not set
# CONFIG_STAGING is not set
CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
#
# CONFIG_EXT2_FS is not set
CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
@@ -2101,6 +2151,11 @@ CONFIG_AUTOFS4_FS=y
CONFIG_GENERIC_ACL=y
#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
# CD-ROM/DVD Filesystems
#
CONFIG_ISO9660_FS=y
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ROMFS_FS is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
CONFIG_RPCSEC_GSS_KRB5=y
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
# CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
# CONFIG_DEBUG_KOBJECT is not set
# CONFIG_DEBUG_HIGHMEM is not set
CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
#
# Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
# CONFIG_SYSPROF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
# CONFIG_BOOT_TRACER is not set
# CONFIG_TRACE_BRANCH_PROFILING is not set
# CONFIG_POWER_TRACER is not set
# CONFIG_STACK_TRACER is not set
# CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_X86_PTDUMP is not set
CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
CONFIG_DEBUG_NX_TEST=m
# CONFIG_4KSTACKS is not set
CONFIG_DOUBLEFAULT=y
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
CONFIG_IO_DELAY_TYPE_0X80=0
CONFIG_IO_DELAY_TYPE_0XED=1
CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
# CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
CONFIG_CRYPTO=y
#
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_HASH2=y
CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
# CONFIG_CRYPTO_GF128MUL is not set
# CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=y
# CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
# Compression
#
# CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
# CONFIG_CRYPTO_LZO is not set
#
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_GEODE is not set
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
CONFIG_VIRTUALIZATION=y
# CONFIG_KVM is not set
# CONFIG_LGUEST is not set
# CONFIG_VIRTIO_PCI is not set
# CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
#
# Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
# CONFIG_LIBCRC32C is not set
CONFIG_AUDIT_GENERIC=y
CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d212ab4..cee1dd2e69b 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:44:16 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:22:00 2009
#
CONFIG_64BIT=y
# CONFIG_X86_32 is not set
CONFIG_X86_64=y
CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf64-x86-64"
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
CONFIG_GENERIC_TIME=y
CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_DEFAULT_IDLE=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_AUDIT_ARCH=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
CONFIG_USE_GENERIC_SMP_HELPERS=y
CONFIG_X86_64_SMP=y
CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
CONFIG_X86_TRAMPOLINE=y
# CONFIG_KTIME_SCALAR is not set
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_BSD_PROCESS_ACCT=y
# CONFIG_BSD_PROCESS_ACCT_V3 is not set
CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
CONFIG_NET_NS=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
# CONFIG_EMBEDDED is not set
CONFIG_UID16=y
CONFIG_SYSCTL_SYSCALL=y
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
CONFIG_HOTPLUG=y
CONFIG_PRINTK=y
CONFIG_BUG=y
CONFIG_ELF_CORE=y
CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
CONFIG_EPOLL=y
CONFIG_SIGNALFD=y
CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_PCI_QUIRKS=y
CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
# CONFIG_SLAB is not set
CONFIG_SLUB=y
# CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
CONFIG_HAVE_KPROBES=y
CONFIG_HAVE_KRETPROBES=y
CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
CONFIG_SLABINFO=y
CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
# CONFIG_MODULE_SRCVERSION_ALL is not set
CONFIG_STOP_MACHINE=y
CONFIG_BLOCK=y
-CONFIG_BLK_DEV_IO_TRACE=y
CONFIG_BLK_DEV_BSG=y
# CONFIG_BLK_DEV_INTEGRITY is not set
CONFIG_BLOCK_COMPAT=y
@@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
CONFIG_SMP=y
CONFIG_SPARSE_IRQ=y
-# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
-CONFIG_X86_FIND_SMP_CONFIG=y
CONFIG_X86_MPPARSE=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
# CONFIG_X86_VSMP is not set
+# CONFIG_X86_UV is not set
CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_PARAVIRT_GUEST is not set
# CONFIG_MEMTEST is not set
@@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
# CONFIG_MCORE2 is not set
CONFIG_GENERIC_CPU=y
CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=6
CONFIG_X86_WP_WORKS_OK=y
CONFIG_X86_TSC=y
CONFIG_X86_CMPXCHG64=y
@@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
CONFIG_X86_DEBUGCTLMSR=y
CONFIG_CPU_SUP_INTEL=y
CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_64=y
+CONFIG_CPU_SUP_CENTAUR=y
CONFIG_X86_DS=y
CONFIG_X86_PTRACE_BTS=y
CONFIG_HPET_TIMER=y
@@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
CONFIG_X86_MCE=y
CONFIG_X86_MCE_INTEL=y
CONFIG_X86_MCE_AMD=y
+CONFIG_X86_MCE_THRESHOLD=y
# CONFIG_I8K is not set
CONFIG_MICROCODE=y
CONFIG_MICROCODE_INTEL=y
@@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
CONFIG_MICROCODE_OLD_INTERFACE=y
CONFIG_X86_MSR=y
CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
CONFIG_DIRECT_GBPAGES=y
CONFIG_NUMA=y
@@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
CONFIG_BOUNCE=y
CONFIG_VIRT_TO_BUS=y
CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
CONFIG_X86_CHECK_BIOS_CORRUPTION=y
CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +333,7 @@ CONFIG_MTRR=y
CONFIG_X86_PAT=y
CONFIG_EFI=y
CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
# CONFIG_HZ_100 is not set
# CONFIG_HZ_250 is not set
# CONFIG_HZ_300 is not set
@@ -325,9 +342,10 @@ CONFIG_HZ=1000
CONFIG_SCHED_HRTICK=y
CONFIG_KEXEC=y
CONFIG_CRASH_DUMP=y
+# CONFIG_KEXEC_JUMP is not set
CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
CONFIG_HOTPLUG_CPU=y
# CONFIG_COMPAT_VDSO is not set
# CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
# CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
# CONFIG_ACPI_SBS is not set
@@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y
# CONFIG_PCI_DEBUG is not set
# CONFIG_PCI_STUB is not set
CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
CONFIG_ISA_DMA_API=y
CONFIG_K8_NB=y
CONFIG_PCCARD=y
@@ -481,7 +499,6 @@ CONFIG_NET=y
#
# Networking options
#
-CONFIG_COMPAT_NET_DEV_OPS=y
CONFIG_PACKET=y
CONFIG_PACKET_MMAP=y
CONFIG_UNIX=y
@@ -639,6 +656,7 @@ CONFIG_LLC=y
# CONFIG_LAPB is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
CONFIG_NET_SCHED=y
#
@@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
#
# CONFIG_NET_PKTGEN is not set
# CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
CONFIG_HAMRADIO=y
#
@@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y
# CONFIG_IRDA is not set
# CONFIG_BT is not set
# CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
CONFIG_FIB_RULES=y
CONFIG_WIRELESS=y
CONFIG_CFG80211=y
# CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
CONFIG_WIRELESS_OLD_REGULATORY=y
CONFIG_WIRELESS_EXT=y
CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y
# CONFIG_TIFM_CORE is not set
# CONFIG_ICS932S401 is not set
# CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_SGI_XP is not set
# CONFIG_HP_ILO is not set
-# CONFIG_SGI_GRU is not set
+# CONFIG_ISL29003 is not set
# CONFIG_C2PORT is not set
#
@@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
# CONFIG_SCSI_LOWLEVEL is not set
# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
# CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
CONFIG_ATA=y
# CONFIG_ATA_NONSTANDARD is not set
CONFIG_ATA_ACPI=y
@@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y
CONFIG_MACINTOSH_DRIVERS=y
CONFIG_MAC_EMUMOUSEBTN=y
CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
# CONFIG_IFB is not set
# CONFIG_DUMMY is not set
# CONFIG_BONDING is not set
@@ -977,6 +995,8 @@ CONFIG_MII=y
CONFIG_NET_VENDOR_3COM=y
# CONFIG_VORTEX is not set
# CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
CONFIG_NET_TULIP=y
# CONFIG_DE2104X is not set
# CONFIG_TULIP is not set
@@ -1026,6 +1046,7 @@ CONFIG_E1000=y
# CONFIG_E1000E is not set
# CONFIG_IP1000 is not set
# CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
# CONFIG_NS83820 is not set
# CONFIG_HAMACHI is not set
# CONFIG_YELLOWFIN is not set
@@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y
# CONFIG_QLA3XXX is not set
# CONFIG_ATL1 is not set
# CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
# CONFIG_JME is not set
CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_IXGBE is not set
# CONFIG_IXGB is not set
# CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
# CONFIG_MYRI10GE is not set
# CONFIG_NETXEN_NIC is not set
# CONFIG_NIU is not set
@@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_BNX2X is not set
# CONFIG_QLGE is not set
# CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
CONFIG_TR=y
# CONFIG_IBMOL is not set
# CONFIG_3C359 is not set
@@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y
# CONFIG_LIBERTAS is not set
# CONFIG_LIBERTAS_THINFIRM is not set
# CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
# CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
# CONFIG_AIRO_CS is not set
# CONFIG_PCMCIA_WL3501 is not set
# CONFIG_PRISM54 is not set
@@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y
# CONFIG_RTL8187 is not set
# CONFIG_ADM8211 is not set
# CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
# CONFIG_P54_COMMON is not set
CONFIG_ATH5K=y
# CONFIG_ATH5K_DEBUG is not set
# CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
# CONFIG_IPW2100 is not set
# CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
# CONFIG_HOSTAP is not set
# CONFIG_B43 is not set
# CONFIG_B43LEGACY is not set
# CONFIG_ZD1211RW is not set
# CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
#
# Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
# CONFIG_TABLET_USB_KBTAB is not set
# CONFIG_TABLET_USB_WACOM is not set
CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
# CONFIG_TOUCHSCREEN_FUJITSU is not set
# CONFIG_TOUCHSCREEN_GUNZE is not set
# CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
# CONFIG_LEGACY_PTYS is not set
# CONFIG_IPMI_HANDLER is not set
CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
# CONFIG_HW_RANDOM_INTEL is not set
# CONFIG_HW_RANDOM_AMD is not set
CONFIG_NVRAM=y
@@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y
# CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_MAX6875 is not set
# CONFIG_SENSORS_TSL2550 is not set
# CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1442,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_ADT7475 is not set
# CONFIG_SENSORS_K8TEMP is not set
# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
# CONFIG_SENSORS_ATXP1 is not set
# CONFIG_SENSORS_DS1621 is not set
# CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1452,7 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_FSCHER is not set
# CONFIG_SENSORS_FSCPOS is not set
# CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
# CONFIG_SENSORS_GL518SM is not set
# CONFIG_SENSORS_GL520SM is not set
# CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1468,14 @@ CONFIG_HWMON=y
# CONFIG_SENSORS_LM90 is not set
# CONFIG_SENSORS_LM92 is not set
# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
# CONFIG_SENSORS_MAX1619 is not set
# CONFIG_SENSORS_MAX6650 is not set
# CONFIG_SENSORS_PC87360 is not set
# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
# CONFIG_SENSORS_SIS5595 is not set
# CONFIG_SENSORS_DME1737 is not set
# CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y
# CONFIG_FB_VIRTUAL is not set
# CONFIG_FB_METRONOME is not set
# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
CONFIG_BACKLIGHT_LCD_SUPPORT=y
# CONFIG_LCD_CLASS_DEVICE is not set
CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y
# CONFIG_SND_INDIGO is not set
# CONFIG_SND_INDIGOIO is not set
# CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
# CONFIG_SND_EMU10K1 is not set
# CONFIG_SND_EMU10K1X is not set
# CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
#
# Special HID drivers
#
-CONFIG_HID_COMPAT=y
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
CONFIG_HID_CHERRY=y
CONFIG_HID_CHICONY=y
CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
CONFIG_HID_LOGITECH=y
CONFIG_LOGITECH_FF=y
# CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y
# CONFIG_USB_TMC is not set
#
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
#
#
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
#
CONFIG_USB_STORAGE=y
# CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
# CONFIG_USB_LED is not set
# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
# CONFIG_USB_IDMOUSE is not set
# CONFIG_USB_FTDI_ELAN is not set
# CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
#
# OTG and related infrastructure
#
+# CONFIG_NOP_USB_XCEIV is not set
# CONFIG_UWB is not set
# CONFIG_MMC is not set
# CONFIG_MEMSTICK is not set
@@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
#
# CONFIG_LEDS_ALIX2 is not set
# CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
# CONFIG_LEDS_CLEVO_MAIL is not set
# CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
#
# LED Triggers
@@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
# CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
# CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
# CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
# CONFIG_ACCESSIBILITY is not set
# CONFIG_INFINIBAND is not set
CONFIG_EDAC=y
@@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y
# DMA Devices
#
# CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
# CONFIG_UIO is not set
# CONFIG_STAGING is not set
CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2094,7 @@ CONFIG_DMIID=y
#
# CONFIG_EXT2_FS is not set
CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
CONFIG_EXT3_FS_XATTR=y
CONFIG_EXT3_FS_POSIX_ACL=y
CONFIG_EXT3_FS_SECURITY=y
@@ -2082,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y
CONFIG_GENERIC_ACL=y
#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
# CD-ROM/DVD Filesystems
#
CONFIG_ISO9660_FS=y
@@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
# CONFIG_ROMFS_FS is not set
# CONFIG_SYSV_FS is not set
# CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
CONFIG_NETWORK_FILESYSTEMS=y
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
@@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
CONFIG_NFS_COMMON=y
CONFIG_SUNRPC=y
CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
CONFIG_RPCSEC_GSS_KRB5=y
# CONFIG_RPCSEC_GSS_SPKM3 is not set
# CONFIG_SMB_FS is not set
@@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y
CONFIG_DEBUG_KERNEL=y
# CONFIG_DEBUG_SHIRQ is not set
# CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
# CONFIG_SCHED_DEBUG is not set
CONFIG_SCHEDSTATS=y
CONFIG_TIMER_STATS=y
@@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y
# CONFIG_LOCK_STAT is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
# CONFIG_DEBUG_KOBJECT is not set
CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
# CONFIG_FAULT_INJECTION is not set
# CONFIG_LATENCYTOP is not set
CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
#
# Tracers
@@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
# CONFIG_SYSPROF_TRACER is not set
# CONFIG_SCHED_TRACER is not set
# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
# CONFIG_BOOT_TRACER is not set
# CONFIG_TRACE_BRANCH_PROFILING is not set
# CONFIG_POWER_TRACER is not set
# CONFIG_STACK_TRACER is not set
# CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
@@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
CONFIG_EARLY_PRINTK_DBGP=y
CONFIG_DEBUG_STACKOVERFLOW=y
CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
# CONFIG_DEBUG_PER_CPU_MAPS is not set
# CONFIG_X86_PTDUMP is not set
CONFIG_DEBUG_RODATA=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_NX_TEST=m
# CONFIG_IOMMU_DEBUG is not set
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
CONFIG_IO_DELAY_TYPE_0X80=0
CONFIG_IO_DELAY_TYPE_0XED=1
CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
# CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
# CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
CONFIG_CRYPTO=y
#
@@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_HASH2=y
CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_MANAGER2=y
# CONFIG_CRYPTO_GF128MUL is not set
# CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=y
# CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
#
CONFIG_CRYPTO_AES=y
# CONFIG_CRYPTO_AES_X86_64 is not set
+# CONFIG_CRYPTO_AES_NI_INTEL is not set
# CONFIG_CRYPTO_ANUBIS is not set
CONFIG_CRYPTO_ARC4=y
# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
# Compression
#
# CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
# CONFIG_CRYPTO_LZO is not set
#
@@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
CONFIG_VIRTUALIZATION=y
# CONFIG_KVM is not set
# CONFIG_VIRTIO_PCI is not set
# CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
#
# Library routines
@@ -2464,7 +2536,10 @@ CONFIG_CRC32=y
# CONFIG_CRC7 is not set
# CONFIG_LIBCRC32C is not set
CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e..e590261ba05 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
.quad compat_sys_signalfd4
.quad sys_eventfd2
.quad sys_epoll_create1
- .quad sys_dup3 /* 330 */
+ .quad sys_dup3 /* 330 */
.quad sys_pipe2
.quad sys_inotify_init1
.quad compat_sys_preadv
.quad compat_sys_pwritev
+ .quad compat_sys_rt_tgsigqueueinfo /* 335 */
+ .quad sys_perf_counter_open
ia32_syscall_end:
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18eadf7..1a37bcdc860 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
#include <linux/types.h>
#include <linux/stddef.h>
+#include <linux/stringify.h>
#include <asm/asm.h>
/*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
const unsigned char *const *find_nop_table(void);
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature) \
+ \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".section .altinstructions,\"a\"\n" \
+ _ASM_ALIGN "\n" \
+ _ASM_PTR "661b\n" /* label */ \
+ _ASM_PTR "663f\n" /* new instruction */ \
+ " .byte " __stringify(feature) "\n" /* feature bit */ \
+ " .byte 662b-661b\n" /* sourcelen */ \
+ " .byte 664f-663f\n" /* replacementlen */ \
+ ".previous\n" \
+ ".section .altinstr_replacement, \"ax\"\n" \
+ "663:\n\t" newinstr "\n664:\n" /* replacement */ \
+ ".previous"
+
/*
* Alternative instructions for different CPU types or capabilities.
*
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
* without volatile and memory clobber.
*/
#define alternative(oldinstr, newinstr, feature) \
- asm volatile ("661:\n\t" oldinstr "\n662:\n" \
- ".section .altinstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
- " .byte %c0\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
- " .byte 664f-663f\n" /* replacementlen */ \
- ".previous\n" \
- ".section .altinstr_replacement,\"ax\"\n" \
- "663:\n\t" newinstr "\n664:\n" /* replacement */ \
- ".previous" :: "i" (feature) : "memory")
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
/*
* Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
* Best is to use constraints that are fixed size (like (%1) ... "r")
* If you use variable sized constraints like "m" or "g" in the
* replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
*/
#define alternative_input(oldinstr, newinstr, feature, input...) \
- asm volatile ("661:\n\t" oldinstr "\n662:\n" \
- ".section .altinstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
- " .byte %c0\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
- " .byte 664f-663f\n" /* replacementlen */ \
- ".previous\n" \
- ".section .altinstr_replacement,\"ax\"\n" \
- "663:\n\t" newinstr "\n664:\n" /* replacement */ \
- ".previous" :: "i" (feature), ##input)
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : : "i" (0), ## input)
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, feature, output, input...) \
- asm volatile ("661:\n\t" oldinstr "\n662:\n" \
- ".section .altinstructions,\"a\"\n" \
- _ASM_ALIGN "\n" \
- _ASM_PTR "661b\n" /* label */ \
- _ASM_PTR "663f\n" /* new instruction */ \
- " .byte %c[feat]\n" /* feature bit */ \
- " .byte 662b-661b\n" /* sourcelen */ \
- " .byte 664f-663f\n" /* replacementlen */ \
- ".previous\n" \
- ".section .altinstr_replacement,\"ax\"\n" \
- "663:\n\t" newinstr "\n664:\n" /* replacement */ \
- ".previous" : output : [feat] "i" (feature), ##input)
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
+ : output : "i" (0), ## input)
/*
* use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344329b..262e0282004 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,6 +27,8 @@ extern int amd_iommu_init(void);
extern int amd_iommu_init_dma_ops(void);
extern void amd_iommu_detect(void);
extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
#else
static inline int amd_iommu_init(void) { return -ENODEV; }
static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b..0c878caaa0a 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,27 @@
#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
domain for an IOMMU */
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...) \
+ do { \
+ if (amd_iommu_dump) \
+ printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
+ } while(0);
+
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+ list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+ list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+#define APERTURE_RANGE_SHIFT 27 /* 128 MB */
+#define APERTURE_RANGE_SIZE (1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES 32 /* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
/*
* This structure contains generic data for IOMMU protection domains
@@ -210,6 +231,26 @@ struct protection_domain {
};
/*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+ /* address allocation bitmap */
+ unsigned long *bitmap;
+
+ /*
+ * Array of PTE pages for the aperture. In this array we save all the
+ * leaf pages of the domain page table used for the aperture. This way
+ * we don't need to walk the page table to find a specific PTE. We can
+ * just calculate its address in constant time.
+ */
+ u64 *pte_pages[64];
+
+ unsigned long offset;
+};
+
+/*
* Data container for a dma_ops specific protection domain
*/
struct dma_ops_domain {
@@ -222,18 +263,10 @@ struct dma_ops_domain {
unsigned long aperture_size;
/* address we start to search for free addresses */
- unsigned long next_bit;
-
- /* address allocation bitmap */
- unsigned long *bitmap;
+ unsigned long next_address;
- /*
- * Array of PTE pages for the aperture. In this array we save all the
- * leaf pages of the domain page table used for the aperture. This way
- * we don't need to walk the page table to find a specific PTE. We can
- * just calculate its address in constant time.
- */
- u64 **pte_pages;
+ /* address space relevant data */
+ struct aperture_range *aperture[APERTURE_MAX_RANGES];
/* This will be set to true when TLB needs to be flushed */
bool need_flush;
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f837742..bb7d4792584 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
extern void native_apic_icr_write(u32 low, u32 id);
extern u64 native_apic_icr_read(void);
-#define EIM_8BIT_APIC_ID 0
-#define EIM_32BIT_APIC_ID 1
+extern int x2apic_mode;
#ifdef CONFIG_X86_X2APIC
/*
@@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void)
return val;
}
-extern int x2apic, x2apic_phys;
+extern int x2apic_phys;
extern void check_x2apic(void);
extern void enable_x2apic(void);
-extern void enable_IR_x2apic(void);
extern void x2apic_icr_write(u32 low, u32 id);
static inline int x2apic_enabled(void)
{
@@ -183,6 +181,8 @@ static inline int x2apic_enabled(void)
return 1;
return 0;
}
+
+#define x2apic_supported() (cpu_has_x2apic)
#else
static inline void check_x2apic(void)
{
@@ -190,28 +190,20 @@ static inline void check_x2apic(void)
static inline void enable_x2apic(void)
{
}
-static inline void enable_IR_x2apic(void)
-{
-}
static inline int x2apic_enabled(void)
{
return 0;
}
-#define x2apic 0
-
+#define x2apic_preenabled 0
+#define x2apic_supported() 0
#endif
-extern int get_physical_broadcast(void);
+extern void enable_IR_x2apic(void);
-#ifdef CONFIG_X86_X2APIC
-static inline void ack_x2APIC_irq(void)
-{
- /* Docs say use 0 for future compatibility */
- native_apic_msr_write(APIC_EOI, 0);
-}
-#endif
+extern int get_physical_broadcast(void);
+extern void apic_disable(void);
extern int lapic_get_maxlvt(void);
extern void clear_local_APIC(void);
extern void connect_bsp_APIC(void);
@@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { }
#define local_apic_timer_c2_ok 1
static inline void init_apic_mappings(void) { }
static inline void disable_local_APIC(void) { }
-
+static inline void apic_disable(void) { }
#endif /* !CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_64
@@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (APIC_XAPIC(ver))
+ if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
return (x >> 24) & 0xFF;
else
return (x >> 24) & 0x0F;
@@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void)
extern void default_setup_apic_routing(void);
#ifdef CONFIG_X86_32
+
+extern struct apic apic_default;
+
/*
* Set up the logical destination ID.
*
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index bc9514fb3b1..7ddb36ab933 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
# define APIC_INTEGRATED(x) (1)
#endif
#define APIC_XAPIC(x) ((x) >= 0x14)
+#define APIC_EXT_SPACE(x) ((x) & 0x80000000)
#define APIC_TASKPRI 0x80
#define APIC_TPRI_MASK 0xFFu
#define APIC_ARBPRI 0x90
@@ -116,7 +117,9 @@
#define APIC_TDR_DIV_32 0x8
#define APIC_TDR_DIV_64 0x9
#define APIC_TDR_DIV_128 0xA
-#define APIC_EILVT0 0x500
+#define APIC_EFEAT 0x400
+#define APIC_ECTRL 0x410
+#define APIC_EILVTn(n) (0x500 + 0x10 * n)
#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
#define APIC_EILVT_NR_AMD_10H 4
#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
#define APIC_EILVT_MSG_NMI 0x4
#define APIC_EILVT_MSG_EXT 0x7
#define APIC_EILVT_MASKED (1 << 16)
-#define APIC_EILVT1 0x510
-#define APIC_EILVT2 0x520
-#define APIC_EILVT3 0x530
#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
#define APIC_BASE_MSR 0x800
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba422..aff9f1fcdcd 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
#define smp_mb__before_atomic_inc() barrier()
#define smp_mb__after_atomic_inc() barrier()
+/* An 64bit atomic type */
+
+typedef struct {
+ unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val) { (val) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr) ((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+ asm volatile(
+
+ LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+ : "=A" (old)
+
+ : [ptr] "D" (ptr),
+ "A" (old),
+ "b" (ll_low(new)),
+ "c" (ll_high(new))
+
+ : "memory");
+
+ return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+ unsigned long long new_val)
+{
+ return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ * @old_val: old value that was there
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
+{
+ unsigned long long old_val;
+
+ do {
+ old_val = atomic_read(ptr);
+ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+ return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ * @new_val: value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+ atomic64_xchg(ptr, new_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+ unsigned long long curr_val;
+
+ do {
+ curr_val = __atomic64_read(ptr);
+ } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+ return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+ unsigned long long old_val, new_val;
+
+ do {
+ old_val = atomic_read(ptr);
+ new_val = old_val + delta;
+
+ } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+ return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+ return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+ return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+ return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+ atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+ atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+ unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+ return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+ atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+ atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+ return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+ long long old_val = atomic64_add_return(delta, ptr);
+
+ return old_val < 0;
+}
+
#include <asm-generic/atomic.h>
#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd9fc9..418e632d4a8 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
#ifdef __KERNEL__
+#include <asm/page_types.h>
+
/* Physical address where kernel should be loaded. */
#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
+ (CONFIG_PHYSICAL_ALIGN - 1)) \
& ~(CONFIG_PHYSICAL_ALIGN - 1))
+/* Minimum kernel alignment, as a power of two */
+#ifdef CONFIG_x86_64
+#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT
+#else
+#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1)
+#endif
+#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
+
+#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
+ (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+#endif
+
#ifdef CONFIG_KERNEL_BZIP2
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adaebf9b..1724e8de317 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
__u32 ramdisk_size;
__u32 bootsect_kludge;
__u16 heap_end_ptr;
- __u16 _pad1;
+ __u8 ext_loader_ver;
+ __u8 ext_loader_type;
__u32 cmd_line_ptr;
__u32 initrd_addr_max;
__u32 kernel_alignment;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 222802029fa..d96c1ee3a95 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
CPU_VALUE_BIT, /* value */
};
-#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
-
-/*
- * DisplayFamily_DisplayModel Processor Families/Processor Number Series
- * -------------------------- ------------------------------------------
- * 05_01, 05_02, 05_04 Pentium, Pentium with MMX
- *
- * 06_01 Pentium Pro
- * 06_03, 06_05 Pentium II Xeon, Pentium II
- * 06_07, 06_08, 06_0A, 06_0B Pentium III Xeon, Pentum III
- *
- * 06_09, 060D Pentium M
- *
- * 06_0E Core Duo, Core Solo
- *
- * 06_0F Xeon 3000, 3200, 5100, 5300, 7300 series,
- * Core 2 Quad, Core 2 Extreme, Core 2 Duo,
- * Pentium dual-core
- * 06_17 Xeon 5200, 5400 series, Core 2 Quad Q9650
- *
- * 06_1C Atom
- *
- * 0F_00, 0F_01, 0F_02 Xeon, Xeon MP, Pentium 4
- * 0F_03, 0F_04 Xeon, Xeon MP, Pentium 4, Pentium D
- *
- * 0F_06 Xeon 7100, 5000 Series, Xeon MP,
- * Pentium 4, Pentium D
- */
-
-/* Register processors bits */
-enum cpu_processor_bit {
- CPU_NONE,
-/* Intel */
- CPU_INTEL_PENTIUM_BIT,
- CPU_INTEL_P6_BIT,
- CPU_INTEL_PENTIUM_M_BIT,
- CPU_INTEL_CORE_BIT,
- CPU_INTEL_CORE2_BIT,
- CPU_INTEL_ATOM_BIT,
- CPU_INTEL_XEON_P4_BIT,
- CPU_INTEL_XEON_MP_BIT,
-/* AMD */
- CPU_AMD_K6_BIT,
- CPU_AMD_K7_BIT,
- CPU_AMD_K8_BIT,
- CPU_AMD_0F_BIT,
- CPU_AMD_10_BIT,
- CPU_AMD_11_BIT,
-};
-
-#define CPU_INTEL_PENTIUM (1 << CPU_INTEL_PENTIUM_BIT)
-#define CPU_INTEL_P6 (1 << CPU_INTEL_P6_BIT)
-#define CPU_INTEL_PENTIUM_M (1 << CPU_INTEL_PENTIUM_M_BIT)
-#define CPU_INTEL_CORE (1 << CPU_INTEL_CORE_BIT)
-#define CPU_INTEL_CORE2 (1 << CPU_INTEL_CORE2_BIT)
-#define CPU_INTEL_ATOM (1 << CPU_INTEL_ATOM_BIT)
-#define CPU_INTEL_XEON_P4 (1 << CPU_INTEL_XEON_P4_BIT)
-#define CPU_INTEL_XEON_MP (1 << CPU_INTEL_XEON_MP_BIT)
-
-#define CPU_INTEL_PX (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
-#define CPU_INTEL_COREX (CPU_INTEL_CORE | CPU_INTEL_CORE2)
-#define CPU_INTEL_XEON (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
-#define CPU_CO_AT (CPU_INTEL_CORE | CPU_INTEL_ATOM)
-#define CPU_C2_AT (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
-#define CPU_CX_AT (CPU_INTEL_COREX | CPU_INTEL_ATOM)
-#define CPU_CX_XE (CPU_INTEL_COREX | CPU_INTEL_XEON)
-#define CPU_P6_XE (CPU_INTEL_P6 | CPU_INTEL_XEON)
-#define CPU_PM_CO_AT (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
-#define CPU_C2_AT_XE (CPU_C2_AT | CPU_INTEL_XEON)
-#define CPU_CX_AT_XE (CPU_CX_AT | CPU_INTEL_XEON)
-#define CPU_P6_CX_AT (CPU_INTEL_P6 | CPU_CX_AT)
-#define CPU_P6_CX_XE (CPU_P6_XE | CPU_INTEL_COREX)
-#define CPU_P6_CX_AT_XE (CPU_INTEL_P6 | CPU_CX_AT_XE)
-#define CPU_PM_CX_AT_XE (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
-#define CPU_PM_CX_AT (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
-#define CPU_PM_CX_XE (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
-#define CPU_PX_CX_AT (CPU_INTEL_PX | CPU_CX_AT)
-#define CPU_PX_CX_AT_XE (CPU_INTEL_PX | CPU_CX_AT_XE)
-
-/* Select all supported Intel CPUs */
-#define CPU_INTEL_ALL (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
-
-#define CPU_AMD_K6 (1 << CPU_AMD_K6_BIT)
-#define CPU_AMD_K7 (1 << CPU_AMD_K7_BIT)
-#define CPU_AMD_K8 (1 << CPU_AMD_K8_BIT)
-#define CPU_AMD_0F (1 << CPU_AMD_0F_BIT)
-#define CPU_AMD_10 (1 << CPU_AMD_10_BIT)
-#define CPU_AMD_11 (1 << CPU_AMD_11_BIT)
-
-#define CPU_K10_PLUS (CPU_AMD_10 | CPU_AMD_11)
-#define CPU_K0F_PLUS (CPU_AMD_0F | CPU_K10_PLUS)
-#define CPU_K8_PLUS (CPU_AMD_K8 | CPU_K0F_PLUS)
-#define CPU_K7_PLUS (CPU_AMD_K7 | CPU_K8_PLUS)
-
-/* Select all supported AMD CPUs */
-#define CPU_AMD_ALL (CPU_AMD_K6 | CPU_K7_PLUS)
-
-/* Select all supported CPUs */
-#define CPU_ALL (CPU_INTEL_ALL | CPU_AMD_ALL)
+#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
#define MAX_CPU_FILES 512
@@ -220,7 +122,6 @@ struct cpu_debug_range {
unsigned min; /* Register range min */
unsigned max; /* Register range max */
unsigned flag; /* Supported flags */
- unsigned model; /* Supported models */
};
#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397a..19af42138f7 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
@@ -94,6 +94,7 @@
#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -192,11 +193,11 @@ extern const char * const x86_power_flags[32];
#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
#define setup_clear_cpu_cap(bit) do { \
clear_cpu_cap(&boot_cpu_data, bit); \
- set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+ set_bit(bit, (unsigned long *)cpu_caps_cleared); \
} while (0)
#define setup_force_cpu_cap(bit) do { \
set_cpu_cap(&boot_cpu_data, bit); \
- clear_bit(bit, (unsigned long *)cleared_cpu_caps); \
+ set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0)
#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a8f672ba100..70dac199b09 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -15,8 +15,8 @@
* - buffer allocation (memory accounting)
*
*
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
*/
#ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
* The interrupt threshold is independent from the overflow callback
* to allow users to use their own overflow interrupt handling mechanism.
*
- * task: the task to request recording for;
- * NULL for per-cpu recording on the current cpu
+ * The function might sleep.
+ *
+ * task: the task to request recording for
+ * cpu: the cpu to request recording for
* base: the base pointer for the (non-pageable) buffer;
* size: the size of the provided buffer in bytes
* ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
* -1 if no interrupt threshold is requested.
* flags: a bit-mask of the above flags
*/
-extern struct bts_tracer *ds_request_bts(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags);
/*
* Release BTS or PEBS resources
* Suspend and resume BTS or PEBS tracing
*
+ * Must be called with irq's enabled.
+ *
* tracer: the tracer handle returned from ds_request_~()
*/
extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
extern void ds_suspend_pebs(struct pebs_tracer *tracer);
extern void ds_resume_pebs(struct pebs_tracer *tracer);
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * Cpu tracers must call this on the traced cpu.
+ * Task tracers must call ds_release_~_noirq() for themselves.
+ *
+ * May be called with irq's disabled.
+ *
+ * Returns 0 if successful;
+ * -EPERM if the cpu tracer does not trace the current cpu.
+ * -EPERM if the task tracer does not trace itself.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_release_bts_noirq(struct bts_tracer *tracer);
+extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
+extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
+extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
+
/*
* The raw DS buffer state as it is used for BTS and PEBS recording.
@@ -170,9 +203,9 @@ struct bts_struct {
} lbr;
/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
struct {
- __u64 jiffies;
+ __u64 clock;
pid_t pid;
- } timestamp;
+ } event;
} variant;
};
@@ -201,8 +234,12 @@ struct bts_trace {
struct pebs_trace {
struct ds_trace ds;
- /* the PEBS reset value */
- unsigned long long reset_value;
+ /* the number of valid counters in the below array */
+ unsigned int counters;
+
+#define MAX_PEBS_COUNTERS 4
+ /* the counter reset value */
+ unsigned long long counter_reset[MAX_PEBS_COUNTERS];
};
@@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
* Returns 0 on success; -Eerrno on error
*
* tracer: the tracer handle returned from ds_request_pebs()
+ * counter: the index of the counter
* value: the new counter reset value
*/
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
+ unsigned int counter, u64 value);
/*
* Initialization
@@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
*/
extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
-/*
- * Task clone/init and cleanup work
- */
-extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
-extern void ds_exit_thread(struct task_struct *tsk);
-
#else /* CONFIG_X86_DS */
struct cpuinfo_x86;
static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
static inline void ds_switch_to(struct task_struct *prev,
struct task_struct *next) {}
-static inline void ds_copy_thread(struct task_struct *tsk,
- struct task_struct *father) {}
-static inline void ds_exit_thread(struct task_struct *tsk) {}
#endif /* CONFIG_X86_DS */
#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf25..d750a10ccad 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,7 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
#ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
#endif
#ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f98..9ebc5c25503 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
unsigned int irq_spurious_count;
#endif
unsigned int generic_irqs; /* arch dependent */
+ unsigned int apic_perf_irqs;
+ unsigned int apic_pending_irqs;
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd7..6df45f63966 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,8 @@
extern void apic_timer_interrupt(void);
extern void generic_interrupt(void);
extern void error_interrupt(void);
+extern void perf_pending_interrupt(void);
+
extern void spurious_interrupt(void);
extern void thermal_interrupt(void);
extern void reschedule_interrupt(void);
@@ -63,7 +65,26 @@ extern unsigned long io_apic_irqs;
extern void init_VISWS_APIC_irqs(void);
extern void setup_IO_APIC(void);
extern void disable_IO_APIC(void);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+
+struct io_apic_irq_attr {
+ int ioapic;
+ int ioapic_pin;
+ int trigger;
+ int polarity;
+};
+
+static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
+ int ioapic, int ioapic_pin,
+ int trigger, int polarity)
+{
+ irq_attr->ioapic = ioapic;
+ irq_attr->ioapic_pin = ioapic_pin;
+ irq_attr->trigger = trigger;
+ irq_attr->polarity = polarity;
+}
+
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
+ struct io_apic_irq_attr *irq_attr);
extern void setup_ioapic_dest(void);
extern void enable_IO_APIC(void);
@@ -78,7 +99,11 @@ extern void eisa_set_level_irq(unsigned int irq);
/* SMP */
extern void smp_apic_timer_interrupt(struct pt_regs *);
extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_generic_interrupt(struct pt_regs *);
extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_IO_APIC
+extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
+#endif
#ifdef CONFIG_SMP
extern void smp_reschedule_interrupt(struct pt_regs *);
extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e518398..175adf58dd4 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
-#if 0 /* See comment in __save_init_fpu() below. */
+#if 0 /* See comment in fxsave() below. */
: [fx] "r" (fx), "m" (*fx), "0" (0));
#else
: [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
return err;
}
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
- if (task_thread_info(tsk)->status & TS_XSAVE)
- return xrstor_checking(&tsk->thread.xstate->xsave);
- else
- return fxrstor_checking(&tsk->thread.xstate->fxsave);
-}
-
/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in __fxsave_clear() below. */
+#if 0 /* See comment in fxsave() below. */
: [fx] "r" (fx), "0" (0));
#else
: [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
asm volatile("fnclex ; fwait");
}
-static inline void restore_fpu(struct task_struct *tsk)
+/* perform fxrstor iff the processor has extended states, otherwise frstor */
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
- if (task_thread_info(tsk)->status & TS_XSAVE) {
- xrstor_checking(&tsk->thread.xstate->xsave);
- return;
- }
/*
* The "nop" is needed to make the instructions the same
* length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
"nop ; frstor %1",
"fxrstor %1",
X86_FEATURE_FXSR,
- "m" (tsk->thread.xstate->fxsave));
+ "m" (*fx));
+
+ return 0;
}
/* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
#endif /* CONFIG_X86_64 */
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+ if (task_thread_info(tsk)->status & TS_XSAVE)
+ return xrstor_checking(&tsk->thread.xstate->xsave);
+ else
+ return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
/*
* Signal frame handlers...
*/
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
/*
* Some instructions like VIA's padlock instructions generate a spurious
* DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context aswell. To prevent these kernel instructions
- * in interrupt context interact wrongly with other user/kernel fpu usage, we
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
* should use them only in the context of irq_ts_save/restore()
*/
static inline int irq_ts_save(void)
{
/*
- * If we are in process context, we are ok to take a spurious DNA fault.
- * Otherwise, doing clts() in process context require pre-emption to
- * be disabled or some heavy lifting like kernel_fpu_begin()
+ * If in process context and not atomic, we can take a spurious DNA fault.
+ * Otherwise, doing clts() in process context requires disabling preemption
+ * or some heavy lifting like kernel_fpu_begin()
*/
- if (!in_interrupt())
+ if (!in_atomic())
return 0;
if (read_cr0() & X86_CR0_TS) {
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1a99e6c092a..58d7091eeb1 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
extern void mask_8259A(void);
extern void unmask_8259A(void);
-#ifdef CONFIG_X86_32
-extern void init_ISA_irqs(void);
-#endif
-
#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2..00000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
-
-#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
-
-#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
-
-#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
-
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
- (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
-
-union cpuid10_eax {
- struct {
- unsigned int version_id:8;
- unsigned int num_counters:8;
- unsigned int bit_width:8;
- unsigned int mask_length:8;
- } split;
- unsigned int full;
-};
-
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e43601..daf866ed061 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,22 +154,19 @@ extern int timer_through_8259;
extern int io_apic_get_unique_id(int ioapic, int apic_id);
extern int io_apic_get_version(int ioapic);
extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
- int edge_level, int active_high_low);
#endif /* CONFIG_ACPI */
+struct io_apic_irq_attr;
+extern int io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr);
extern int (*ioapic_renumber_irq)(int ioapic, int irq);
extern void ioapic_init_mappings(void);
-#ifdef CONFIG_X86_64
extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
- struct IO_APIC_route_entry **ioapic_entries);
-#endif
extern void probe_nr_irqs_gsi(void);
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af26091d6..0e9fe1d9d97 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
+#ifndef _ASM_X86_IOMAP_H
+#define _ASM_X86_IOMAP_H
+
/*
* Copyright © 2008 Ingo Molnar
*
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
void
iounmap_atomic(void *kvaddr, enum km_type type);
+
+#endif /* _ASM_X86_IOMAP_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 0396760fccb..f275e224450 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
#ifndef _ASM_X86_IRQ_REMAPPING_H
#define _ASM_X86_IRQ_REMAPPING_H
-#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47..e997be98c9b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -34,6 +34,7 @@
#ifdef CONFIG_X86_32
# define SYSCALL_VECTOR 0x80
+# define IA32_SYSCALL_VECTOR 0x80
#else
# define IA32_SYSCALL_VECTOR 0x80
#endif
@@ -107,14 +108,14 @@
#define LOCAL_TIMER_VECTOR 0xef
/*
- * Performance monitoring interrupt vector:
+ * Generic system vector for platform specific use
*/
-#define LOCAL_PERF_VECTOR 0xee
+#define GENERIC_INTERRUPT_VECTOR 0xed
/*
- * Generic system vector for platform specific use
+ * Performance monitoring pending work vector:
*/
-#define GENERIC_INTERRUPT_VECTOR 0xed
+#define LOCAL_PENDING_VECTOR 0xec
/*
* First APIC vector available to drivers: (vectors 0x30-0xee) we
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc53b24..c2d1f3b58e5 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
extern void k8_flush_garts(void);
extern int k8_scan_nodes(unsigned long start, unsigned long end);
+#ifdef CONFIG_K8_NB
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+ return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+}
+#else
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+ return NULL;
+}
+#endif
+
+
#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664716c..ef51b501e22 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
struct device;
+enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+
struct microcode_ops {
- int (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
- int (*request_microcode_fw) (int cpu, struct device *device);
+ enum ucode_state (*request_microcode_user) (int cpu,
+ const void __user *buf, size_t size);
- void (*apply_microcode) (int cpu);
+ enum ucode_state (*request_microcode_fw) (int cpu,
+ struct device *device);
- int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
void (*microcode_fini_cpu) (int cpu);
+
+ /*
+ * The generic 'microcode_core' part guarantees that
+ * the callbacks below run on a target cpu when they
+ * are being called.
+ * See also the "Synchronization" section in microcode_core.c.
+ */
+ int (*apply_microcode) (int cpu);
+ int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
};
struct ucode_cpu_info {
- struct cpu_signature cpu_sig;
- int valid;
- void *mc;
+ struct cpu_signature cpu_sig;
+ int valid;
+ void *mc;
};
extern struct ucode_cpu_info ucode_cpu_info[];
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 642fc7fc8cd..e2a1bb6d71e 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
#ifdef CONFIG_X86_MPPARSE
extern void find_smp_config(void);
extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
#else
static inline void find_smp_config(void) { }
static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
#endif
void __cpuinit generic_processor_info(int apicid, int version);
@@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
u32 gsi);
extern void mp_config_acpi_legacy_irqs(void);
-extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+struct device;
+extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
+ int active_high_low);
extern int acpi_probe_gsi(void);
#ifdef CONFIG_X86_IO_APIC
-extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
- u32 gsi, int triggering, int polarity);
extern int mp_find_ioapic(int gsi);
extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#else
-static inline int
-mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
- u32 gsi, int triggering, int polarity)
-{
- return 0;
-}
#endif
#else /* !CONFIG_ACPI: */
static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc16c16..4d58d04fca8 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
#define MSR_K8_TOP_MEM1 0xc001001a
#define MSR_K8_TOP_MEM2 0xc001001d
#define MSR_K8_SYSCFG 0xc0010010
-#define MSR_K8_HWCR 0xc0010015
#define MSR_K8_INT_PENDING_MSG 0xc0010055
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a568df..c9726440993 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
* but since they are power of two we could use a
* cheaper way --cvg
*/
- return nmi_watchdog & 0x3;
+ return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
}
#endif
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6df4cb..c4ae822e415 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
extern void numa_init_array(void);
extern int numa_off;
-extern void srat_reserve_add_area(int nodeid);
-extern int hotadd_percent;
-
extern s16 apicid_to_node[MAX_LOCAL_APIC];
extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
unsigned long end);
#ifdef CONFIG_NUMA
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
extern void __init init_cpu_to_node(void);
extern void __cpuinit numa_set_node(int cpu, int node);
extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae649a..6f1b7331313 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
extern int sysctl_legacy_va_layout;
extern void find_low_pfn_range(void);
-extern unsigned long init_memory_mapping(unsigned long start,
- unsigned long end);
-extern void initmem_init(unsigned long, unsigned long);
-extern void free_initmem(void);
extern void setup_bootmem_allocator(void);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b7024..8d382d3abf3 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,22 +32,14 @@
*/
#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
-#define __PHYSICAL_START CONFIG_PHYSICAL_START
-#define __KERNEL_ALIGN 0x200000
-
-/*
- * Make sure kernel is aligned to 2MB address. Catching it at compile
- * time is better. Change your config file and compile the kernel
- * for a 2MB aligned address (CONFIG_PHYSICAL_START)
- */
-#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
-#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
-#endif
+#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
+ (CONFIG_PHYSICAL_ALIGN - 1)) & \
+ ~(CONFIG_PHYSICAL_ALIGN - 1))
#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 48
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
#define vmemmap ((struct page *)VMEMMAP_START)
-extern unsigned long init_memory_mapping(unsigned long start,
- unsigned long end);
-
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern void free_initmem(void);
-
extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37006a..6473f5ccff8 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
+extern unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a53da004e08..4fb37c8a083 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
struct tss_struct;
struct mm_struct;
struct desc_struct;
+struct task_struct;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
void (*swapgs)(void);
- struct pv_lazy_ops lazy_mode;
+ void (*start_context_switch)(struct task_struct *prev);
+ void (*end_context_switch)(struct task_struct *next);
};
struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
};
enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_enter_lazy_cpu(void);
-void paravirt_leave_lazy_cpu(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
void paravirt_enter_lazy_mmu(void);
void paravirt_leave_lazy_mmu(void);
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
-#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-static inline void arch_enter_lazy_cpu_mode(void)
+#define __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(struct task_struct *prev)
{
- PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+ PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
}
-static inline void arch_leave_lazy_cpu_mode(void)
+static inline void arch_end_context_switch(struct task_struct *next)
{
- PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+ PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
}
-void arch_flush_lazy_cpu_mode(void);
-
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 00000000000..876ed97147b
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
+
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC 8
+#define X86_PMC_MAX_FIXED 3
+
+#define X86_PMC_IDX_GENERIC 0
+#define X86_PMC_IDX_FIXED 32
+#define X86_PMC_IDX_MAX 64
+
+#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
+
+#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
+
+#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
+
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK 0xffff
+
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
+ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
+
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
+union cpuid10_eax {
+ struct {
+ unsigned int version_id:8;
+ unsigned int num_counters:8;
+ unsigned int bit_width:8;
+ unsigned int mask_length:8;
+ } split;
+ unsigned int full;
+};
+
+union cpuid10_edx {
+ struct {
+ unsigned int num_counters_fixed:4;
+ unsigned int reserved:28;
+ } split;
+ unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
+
+extern void set_perf_counter_pending(void);
+
+#define clear_perf_counter_pending() do { } while (0)
+#define test_perf_counter_pending() (0)
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(void);
+#else
+static inline void init_hw_perf_counters(void) { }
+static inline void perf_counters_lapic_init(void) { }
+#endif
+
+#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d168bc..18ef7ebf263 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
#define pte_val(x) native_pte_val(x)
#define __pte(x) native_make_pte(x)
+#define arch_end_context_switch(prev) do {} while(0)
+
#endif /* CONFIG_PARAVIRT */
/*
@@ -503,6 +505,8 @@ static inline int pgd_none(pgd_t pgd)
#ifndef __ASSEMBLY__
+extern int direct_gbpages;
+
/* local pte updates need not use xchg for locking */
static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
{
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6d501..abde308fdb0 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
extern void paging_init(void);
-#endif /* !__ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-
#define pte_ERROR(e) \
printk("%s:%d: bad pte %p(%016lx).\n", \
__FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define update_mmu_cache(vma, address, pte) do { } while (0)
-extern int direct_gbpages;
-
/* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8e038..766ea16fbbb 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START _AC(0xffffc20000000000, UL)
-#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START _AC(0xffffe20000000000, UL)
+#define VMALLOC_START _AC(0xffffc90000000000, UL)
+#define VMALLOC_END _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START _AC(0xffffea0000000000, UL)
#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc8786..4d258ad76a0 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
extern pteval_t __supported_pte_mask;
extern int nx_enabled;
-extern void set_nx(void);
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae709c..c7768269b1c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;
extern struct tss_struct doublefault_tss;
-extern __u32 cleared_cpu_caps[NCAPINTS];
+extern __u32 cpu_caps_cleared[NCAPINTS];
+extern __u32 cpu_caps_set[NCAPINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
extern unsigned int xstate_size;
extern void free_thread_xstate(struct task_struct *);
extern struct kmem_cache *task_xstate_cachep;
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
struct thread_struct {
/* Cached TLS descriptors: */
@@ -427,8 +425,12 @@ struct thread_struct {
unsigned short fsindex;
unsigned short gsindex;
#endif
+#ifdef CONFIG_X86_32
unsigned long ip;
+#endif
+#ifdef CONFIG_X86_64
unsigned long fs;
+#endif
unsigned long gs;
/* Hardware debugging registers: */
unsigned long debugreg0;
@@ -460,14 +462,8 @@ struct thread_struct {
unsigned io_bitmap_max;
/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
unsigned long debugctlmsr;
-#ifdef CONFIG_X86_DS
-/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+ /* Debug Store context; see asm/ds.h */
struct ds_context *ds_ctx;
-#endif /* CONFIG_X86_DS */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* the signal to send on a bts buffer overflow */
- unsigned int bts_ovfl_signal;
-#endif /* CONFIG_X86_PTRACE_BTS */
};
static inline unsigned long native_get_debugreg(int regno)
@@ -795,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void)
return debugctlmsr;
}
+static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
+{
+ u64 debugctlmsr = 0;
+ u32 val1, val2;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return 0;
+#endif
+ rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
+ debugctlmsr = val1 | ((u64)val2 << 32);
+
+ return debugctlmsr;
+}
+
static inline void update_debugctlmsr(unsigned long debugctlmsr)
{
#ifndef CONFIG_X86_DEBUGCTLMSR
@@ -804,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
}
+static inline void update_debugctlmsr_on_cpu(int cpu,
+ unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+ if (boot_cpu_data.x86 < 6)
+ return;
+#endif
+ wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
+ (u32)((u64)debugctlmsr),
+ (u32)((u64)debugctlmsr >> 32));
+}
+
/*
* from system description table in BIOS. Mostly for MCA use, but
* others may find it useful:
@@ -814,6 +837,7 @@ extern unsigned int BIOS_revision;
/* Boot loader type from the setup header: */
extern int bootloader_type;
+extern int bootloader_version;
extern char ignore_fpu_irq;
@@ -874,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x)
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
- .fs = __KERNEL_PERCPU, \
}
/*
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 624f133943e..0f0d908349a 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
extern int do_set_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info, int can_allocate);
-extern void x86_ptrace_untrace(struct task_struct *);
-extern void x86_ptrace_fork(struct task_struct *child,
- unsigned long clone_flags);
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void ptrace_bts_untrace(struct task_struct *tsk);
-#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
-#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk)
+#endif /* CONFIG_X86_PTRACE_BTS */
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd5..64cf2d24fad 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
#endif
#ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT
+/* Paravirtualized systems may not have PSE or PGE available */
#define NEED_PSE 0
-#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
#define NEED_PGE 0
+#else
+#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31)
+#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31)
+#endif
+#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bdc2ada05ae..4093d1ed6db 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
int (*setup_ioapic_ids)(void);
};
-extern void x86_quirk_pre_intr_init(void);
extern void x86_quirk_intr_init(void);
extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19e0d88b966..6a84ed166ae 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
static inline int logical_smp_processor_id(void)
{
/* we don't want to mark this access volatile - bad code generation */
- return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+ return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
}
#endif
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c063ec..4517d6b9318 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
# define MAX_PHYSADDR_BITS 44
-# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */
+# define MAX_PHYSMEM_BITS 46
#endif
#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408f690..372b76edd63 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
/*
* syscalls.h - Linux syscall interfaces (arch-specific)
*
- * Copyright (c) 2008 Jaswinder Singh
+ * Copyright (c) 2008 Jaswinder Singh Rajput
*
* This file is released under the GPLv2.
* See the file COPYING for more details.
@@ -12,50 +12,55 @@
#include <linux/compiler.h>
#include <linux/linkage.h>
-#include <linux/types.h>
#include <linux/signal.h>
+#include <linux/types.h>
/* Common in X86_32 and X86_64 */
/* kernel/ioport.c */
asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
+/* kernel/process.c */
+int sys_fork(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+
/* kernel/ldt.c */
asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+/* kernel/signal.c */
+long sys_rt_sigreturn(struct pt_regs *);
+
/* kernel/tls.c */
asmlinkage int sys_set_thread_area(struct user_desc __user *);
asmlinkage int sys_get_thread_area(struct user_desc __user *);
/* X86_32 only */
#ifdef CONFIG_X86_32
+/* kernel/ioport.c */
+long sys_iopl(struct pt_regs *);
+
/* kernel/process_32.c */
-int sys_fork(struct pt_regs *);
int sys_clone(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
int sys_execve(struct pt_regs *);
-/* kernel/signal_32.c */
+/* kernel/signal.c */
asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
struct old_sigaction __user *);
int sys_sigaltstack(struct pt_regs *);
unsigned long sys_sigreturn(struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
-
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
/* kernel/sys_i386_32.c */
+struct mmap_arg_struct;
+struct sel_arg_struct;
+struct oldold_utsname;
+struct old_utsname;
+
asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
-struct mmap_arg_struct;
asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-struct sel_arg_struct;
asmlinkage int old_select(struct sel_arg_struct __user *);
asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-struct old_utsname;
asmlinkage int sys_uname(struct old_utsname __user *);
-struct oldold_utsname;
asmlinkage int sys_olduname(struct oldold_utsname __user *);
/* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
#else /* CONFIG_X86_32 */
/* X86_64 only */
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
/* kernel/process_64.c */
-asmlinkage long sys_fork(struct pt_regs *);
asmlinkage long sys_clone(unsigned long, unsigned long,
void __user *, void __user *,
struct pt_regs *);
-asmlinkage long sys_vfork(struct pt_regs *);
asmlinkage long sys_execve(char __user *, char __user * __user *,
char __user * __user *,
struct pt_regs *);
long sys_arch_prctl(int, unsigned long);
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
-/* kernel/signal_64.c */
+/* kernel/signal.c */
asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
/* kernel/sys_x86_64.c */
+struct new_utsname;
+
asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
-struct new_utsname;
asmlinkage long sys_uname(struct new_utsname __user *);
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8820a73ae09..602c769fc98 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,7 +94,8 @@ struct thread_info {
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
-#define TIF_SYSCALL_FTRACE 27 /* for ftrace syscall instrumentation */
+#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
+#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */
#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
+#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE)
/* work to do in syscall_trace_enter() */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 16a5c84b032..a5ecc9c33e9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,7 +17,7 @@
static inline void __native_flush_tlb(void)
{
- write_cr3(read_cr3());
+ native_write_cr3(native_read_cr3());
}
static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
*/
raw_local_irq_save(flags);
- cr4 = read_cr4();
+ cr4 = native_read_cr4();
/* clear PGE */
- write_cr4(cr4 & ~X86_CR4_PGE);
+ native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
- write_cr4(cr4);
+ native_write_cr4(cr4);
raw_local_irq_restore(flags);
}
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca4..066ef590d7e 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
void x86_pci_root_bus_res_quirks(struct pci_bus *b);
#ifdef CONFIG_SMP
-#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
+ (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
#define smt_capable() (smp_num_siblings > 1)
#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b8..bfd74c032fc 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
#define _ASM_X86_TRAPS_H
#include <asm/debugreg.h>
+#include <asm/siginfo.h> /* TRAP_TRACE, ... */
#ifdef CONFIG_X86_32
#define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
+asmlinkage void xen_debug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_stack_segment(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
}
extern int panic_on_unrecovered_nmi;
-extern int kstack_depth_to_print;
void math_error(void __user *);
void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8d..732a3070615 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
#define __NR_inotify_init1 332
#define __NR_preadv 333
#define __NR_pwritev 334
+#define __NR_rt_tgsigqueueinfo 335
+#define __NR_perf_counter_open 336
#ifdef __KERNEL__
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f8182946232..900e1617e67 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
__SYSCALL(__NR_preadv, sys_preadv)
#define __NR_pwritev 296
__SYSCALL(__NR_pwritev, sys_pwritev)
-
+#define __NR_rt_tgsigqueueinfo 297
+__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
+#define __NR_perf_counter_open 298
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61bf7a8..bddd44f2f0a 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
#define UV_CPUS_PER_ACT_STATUS 32
#define UV_ACT_STATUS_MASK 0x3
#define UV_ACT_STATUS_SIZE 2
-#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
+#define UV_ADP_SIZE 32
#define UV_DISTRIBUTION_SIZE 256
#define UV_SW_ACK_NPENDING 8
#define UV_NET_ENDPOINT_INTD 0x38
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea1062..341070f7ad5 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
struct uv_hub_info_s {
unsigned long global_mmr_base;
unsigned long gpa_mask;
+ unsigned int gnode_extra;
unsigned long gnode_upper;
unsigned long lowmem_remap_top;
unsigned long lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
* p - PNODE (local part of nsids, right shifted 1)
*/
#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper)
+#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
+#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
#define UV_LOCAL_MMR_BASE 0xf4000000UL
#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
- ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+ ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
#define UV_APIC_PNODE_SHIFT 6
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 88d1bfc847d..4f78bd68212 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o := $(nostackp)
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o
+obj-y += setup.o i8259.o irqinit.o
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
obj-$(CONFIG_X86_32) += probe_roms_32.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -44,6 +44,7 @@ obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
obj-$(CONFIG_X86_DS) += ds.o
+obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f80..631086159c5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
#include <linux/irq.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
+#include <linux/pci.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
* success: return IRQ number (>=0)
* failure: return < 0
*/
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
{
unsigned int irq;
unsigned int plat_gsi = gsi;
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- if (triggering == ACPI_LEVEL_SENSITIVE)
+ if (trigger == ACPI_LEVEL_SENSITIVE)
eisa_set_level_irq(gsi);
}
#endif
#ifdef CONFIG_X86_IO_APIC
if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
- plat_gsi = mp_register_gsi(gsi, triggering, polarity);
+ plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
}
#endif
acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +904,8 @@ extern int es7000_plat;
#endif
static struct {
- int apic_id;
int gsi_base;
int gsi_end;
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
} mp_ioapic_routing[MAX_IO_APICS];
int mp_find_ioapic(int gsi)
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
mp_ioapics[idx].apicver = io_apic_get_version(idx);
-#else
- mp_ioapics[idx].apicver = 0;
-#endif
+
/*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
mp_ioapic_routing[idx].gsi_base = gsi_base;
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void)
}
}
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
int ioapic;
- int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
+ u8 pin;
- static int pci_irq = IRQ_COMPRESSION_START;
- /*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
- */
- static int gsi_to_irq[MAX_GSI_NUM];
-#else
+ if (!acpi_ioapic)
+ return 0;
+ if (!dev)
+ return 0;
+ if (dev->bus != &pci_bus_type)
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* print the entry should happen on mptable identically */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ save_mp_irq(&mp_irq);
+#endif
+ return 0;
+}
+
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+ struct io_apic_irq_attr irq_attr;
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
-#endif
/* Don't set up the ACPI SCI because it's already set up */
if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
gsi = ioapic_renumber_irq(ioapic, gsi);
#endif
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ "%d-%d\n", mp_ioapics[ioapic].apicid,
ioapic_pin);
return gsi;
}
- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
- return gsi;
-#endif
- }
-
- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
- /*
- * For GSI >= 64, use IRQ compression
- */
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-#endif
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
-}
-int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
- u32 gsi, int triggering, int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
- struct mpc_intsrc mp_irq;
- int ioapic;
+ if (enable_update_mptable)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
- if (!acpi_ioapic)
- return 0;
+ set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+ trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ io_apic_set_pci_routing(dev, gsi, &irq_attr);
- /* print the entry should happen on mptable identically */
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
- (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.srcbus = number;
- mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
- ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
- mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
- save_mp_irq(&mp_irq);
-#endif
- return 0;
+ return gsi;
}
/*
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9de..167bc16ce0e 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
always := wakeup.bin
targets := wakeup.elf wakeup.lds
-wakeup-y += wakeup.o wakemain.o video-mode.o copy.o
+wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
# The link order of the video-*.o modules can matter. In particular,
# video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 00000000000..f51eb0bb56c
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 00000000000..6206033ba20
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad5..1c60554537c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,16 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
static struct dma_ops_domain *find_protection_domain(u16 devid);
+static u64* alloc_pte(struct protection_domain *dom,
+ unsigned long address, u64
+ **pte_page, gfp_t gfp);
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages);
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
#ifdef CONFIG_AMD_IOMMU_STATS
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
{
struct amd_iommu *iommu;
- list_for_each_entry(iommu, &amd_iommu_list, list)
+ for_each_iommu(iommu)
iommu_poll_events(iommu);
return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
domid, 1, 1);
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
spin_lock_irqsave(&iommu->lock, flags);
__iommu_queue_command(iommu, &cmd);
__iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
}
}
+void amd_iommu_flush_all_domains(void)
+{
+ int i;
+
+ for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+ if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+ continue;
+ iommu_flush_domain(i);
+ }
+}
+
+void amd_iommu_flush_all_devices(void)
+{
+ struct amd_iommu *iommu;
+ int i;
+
+ for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+ if (amd_iommu_pd_table[i] == NULL)
+ continue;
+
+ iommu = amd_iommu_rlookup_table[i];
+ if (!iommu)
+ continue;
+
+ iommu_queue_inv_dev_entry(iommu, i);
+ iommu_completion_wait(iommu);
+ }
+}
+
/****************************************************************************
*
* The functions below are used the create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
unsigned long phys_addr,
int prot)
{
- u64 __pte, *pte, *page;
+ u64 __pte, *pte;
bus_addr = PAGE_ALIGN(bus_addr);
phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
return -EINVAL;
- pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- *pte = IOMMU_L2_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
- if (!IOMMU_PTE_PRESENT(*pte)) {
- page = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
- *pte = IOMMU_L1_PDE(virt_to_phys(page));
- }
-
- pte = IOMMU_PTE_PAGE(*pte);
- pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+ pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
if (IOMMU_PTE_PRESENT(*pte))
return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
* as allocated in the aperture
*/
if (addr < dma_dom->aperture_size)
- __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+ __set_bit(addr >> PAGE_SHIFT,
+ dma_dom->aperture[0]->bitmap);
}
return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
****************************************************************************/
/*
- * The address allocator core function.
+ * The address allocator core functions.
*
* called with domain->lock held
*/
+
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64* fetch_pte(struct protection_domain *domain,
+ unsigned long address)
+{
+ u64 *pte;
+
+ pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return NULL;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+ return pte;
+}
+
+/*
+ * This function is used to add a new aperture range to an existing
+ * aperture in case of dma_ops domain allocation or address allocation
+ * failure.
+ */
+static int alloc_new_range(struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ bool populate, gfp_t gfp)
+{
+ int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+ int i;
+
+#ifdef CONFIG_IOMMU_STRESS
+ populate = false;
+#endif
+
+ if (index >= APERTURE_MAX_RANGES)
+ return -ENOMEM;
+
+ dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
+ if (!dma_dom->aperture[index])
+ return -ENOMEM;
+
+ dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
+ if (!dma_dom->aperture[index]->bitmap)
+ goto out_free;
+
+ dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+
+ if (populate) {
+ unsigned long address = dma_dom->aperture_size;
+ int i, num_ptes = APERTURE_RANGE_PAGES / 512;
+ u64 *pte, *pte_page;
+
+ for (i = 0; i < num_ptes; ++i) {
+ pte = alloc_pte(&dma_dom->domain, address,
+ &pte_page, gfp);
+ if (!pte)
+ goto out_free;
+
+ dma_dom->aperture[index]->pte_pages[i] = pte_page;
+
+ address += APERTURE_RANGE_SIZE / 64;
+ }
+ }
+
+ dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+
+ /* Intialize the exclusion range if necessary */
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
+ iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ int pages = iommu_num_pages(iommu->exclusion_start,
+ iommu->exclusion_length,
+ PAGE_SIZE);
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
+
+ /*
+ * Check for areas already mapped as present in the new aperture
+ * range and mark those pages as reserved in the allocator. Such
+ * mappings may already exist as a result of requested unity
+ * mappings for devices.
+ */
+ for (i = dma_dom->aperture[index]->offset;
+ i < dma_dom->aperture_size;
+ i += PAGE_SIZE) {
+ u64 *pte = fetch_pte(&dma_dom->domain, i);
+ if (!pte || !IOMMU_PTE_PRESENT(*pte))
+ continue;
+
+ dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
+ }
+
+ return 0;
+
+out_free:
+ free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+
+ kfree(dma_dom->aperture[index]);
+ dma_dom->aperture[index] = NULL;
+
+ return -ENOMEM;
+}
+
+static unsigned long dma_ops_area_alloc(struct device *dev,
+ struct dma_ops_domain *dom,
+ unsigned int pages,
+ unsigned long align_mask,
+ u64 dma_mask,
+ unsigned long start)
+{
+ unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
+ int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
+ int i = start >> APERTURE_RANGE_SHIFT;
+ unsigned long boundary_size;
+ unsigned long address = -1;
+ unsigned long limit;
+
+ next_bit >>= PAGE_SHIFT;
+
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ for (;i < max_index; ++i) {
+ unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+
+ if (dom->aperture[i]->offset >= dma_mask)
+ break;
+
+ limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+ dma_mask >> PAGE_SHIFT);
+
+ address = iommu_area_alloc(dom->aperture[i]->bitmap,
+ limit, next_bit, pages, 0,
+ boundary_size, align_mask);
+ if (address != -1) {
+ address = dom->aperture[i]->offset +
+ (address << PAGE_SHIFT);
+ dom->next_address = address + (pages << PAGE_SHIFT);
+ break;
+ }
+
+ next_bit = 0;
+ }
+
+ return address;
+}
+
static unsigned long dma_ops_alloc_addresses(struct device *dev,
struct dma_ops_domain *dom,
unsigned int pages,
unsigned long align_mask,
u64 dma_mask)
{
- unsigned long limit;
unsigned long address;
- unsigned long boundary_size;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
- PAGE_SIZE) >> PAGE_SHIFT;
- limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
- dma_mask >> PAGE_SHIFT);
+#ifdef CONFIG_IOMMU_STRESS
+ dom->next_address = 0;
+ dom->need_flush = true;
+#endif
- if (dom->next_bit >= limit) {
- dom->next_bit = 0;
- dom->need_flush = true;
- }
+ address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+ dma_mask, dom->next_address);
- address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
- 0 , boundary_size, align_mask);
if (address == -1) {
- address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
- 0, boundary_size, align_mask);
+ dom->next_address = 0;
+ address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+ dma_mask, 0);
dom->need_flush = true;
}
- if (likely(address != -1)) {
- dom->next_bit = address + pages;
- address <<= PAGE_SHIFT;
- } else
+ if (unlikely(address == -1))
address = bad_dma_address;
WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
unsigned long address,
unsigned int pages)
{
- address >>= PAGE_SHIFT;
- iommu_area_free(dom->bitmap, address, pages);
+ unsigned i = address >> APERTURE_RANGE_SHIFT;
+ struct aperture_range *range = dom->aperture[i];
- if (address >= dom->next_bit)
+ BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
+
+#ifdef CONFIG_IOMMU_STRESS
+ if (i < 4)
+ return;
+#endif
+
+ if (address >= dom->next_address)
dom->need_flush = true;
+
+ address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+
+ iommu_area_free(range->bitmap, address, pages);
+
}
/****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
unsigned long start_page,
unsigned int pages)
{
- unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+ unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
if (start_page + pages > last_page)
pages = last_page - start_page;
- iommu_area_reserve(dom->bitmap, start_page, pages);
+ for (i = start_page; i < start_page + pages; ++i) {
+ int index = i / APERTURE_RANGE_PAGES;
+ int page = i % APERTURE_RANGE_PAGES;
+ __set_bit(page, dom->aperture[index]->bitmap);
+ }
}
static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
*/
static void dma_ops_domain_free(struct dma_ops_domain *dom)
{
+ int i;
+
if (!dom)
return;
free_pagetable(&dom->domain);
- kfree(dom->pte_pages);
-
- kfree(dom->bitmap);
+ for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+ if (!dom->aperture[i])
+ continue;
+ free_page((unsigned long)dom->aperture[i]->bitmap);
+ kfree(dom->aperture[i]);
+ }
kfree(dom);
}
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
* It also intializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
- unsigned order)
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
{
struct dma_ops_domain *dma_dom;
- unsigned i, num_pte_pages;
- u64 *l2_pde;
- u64 address;
-
- /*
- * Currently the DMA aperture must be between 32 MB and 1GB in size
- */
- if ((order < 25) || (order > 30))
- return NULL;
dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
dma_dom->domain.priv = dma_dom;
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
- dma_dom->aperture_size = (1ULL << order);
- dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
- GFP_KERNEL);
- if (!dma_dom->bitmap)
- goto free_dma_dom;
- /*
- * mark the first page as allocated so we never return 0 as
- * a valid dma-address. So we can use 0 as error value
- */
- dma_dom->bitmap[0] = 1;
- dma_dom->next_bit = 0;
dma_dom->need_flush = false;
dma_dom->target_dev = 0xffff;
- /* Intialize the exclusion range if necessary */
- if (iommu->exclusion_start &&
- iommu->exclusion_start < dma_dom->aperture_size) {
- unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
- int pages = iommu_num_pages(iommu->exclusion_start,
- iommu->exclusion_length,
- PAGE_SIZE);
- dma_ops_reserve_addresses(dma_dom, startpage, pages);
- }
+ if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
+ goto free_dma_dom;
/*
- * At the last step, build the page tables so we don't need to
- * allocate page table pages in the dma_ops mapping/unmapping
- * path.
+ * mark the first page as allocated so we never return 0 as
+ * a valid dma-address. So we can use 0 as error value
*/
- num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
- dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
- GFP_KERNEL);
- if (!dma_dom->pte_pages)
- goto free_dma_dom;
-
- l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (l2_pde == NULL)
- goto free_dma_dom;
+ dma_dom->aperture[0]->bitmap[0] = 1;
+ dma_dom->next_address = 0;
- dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
-
- for (i = 0; i < num_pte_pages; ++i) {
- dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
- if (!dma_dom->pte_pages[i])
- goto free_dma_dom;
- address = virt_to_phys(dma_dom->pte_pages[i]);
- l2_pde[i] = IOMMU_L1_PDE(address);
- }
return dma_dom;
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
struct protection_domain *domain;
struct dma_ops_domain *dma_domain;
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
unsigned long flags;
if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
"to a non-dma-ops domain\n", dev_name(dev));
switch (action) {
- case BUS_NOTIFY_BOUND_DRIVER:
- if (domain)
- goto out;
- dma_domain = find_protection_domain(devid);
- if (!dma_domain)
- dma_domain = iommu->default_dom;
- attach_device(iommu, &dma_domain->domain, devid);
- printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device %s\n", dma_domain->domain.id, dev_name(dev));
- break;
- case BUS_NOTIFY_UNBIND_DRIVER:
+ case BUS_NOTIFY_UNBOUND_DRIVER:
if (!domain)
goto out;
detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
dma_domain = find_protection_domain(devid);
if (dma_domain)
goto out;
- dma_domain = dma_ops_domain_alloc(iommu, order);
+ dma_domain = dma_ops_domain_alloc(iommu);
if (!dma_domain)
goto out;
dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
attach_device(*iommu, *domain, *bdf);
- printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device %s\n", (*domain)->id, dev_name(dev));
+ DUMP_printk("Using protection domain %d for device %s\n",
+ (*domain)->id, dev_name(dev));
}
if (domain_for_device(_bdf) == NULL)
@@ -1144,6 +1277,66 @@ static int get_device_resources(struct device *dev,
}
/*
+ * If the pte_page is not yet allocated this function is called
+ */
+static u64* alloc_pte(struct protection_domain *dom,
+ unsigned long address, u64 **pte_page, gfp_t gfp)
+{
+ u64 *pte, *page;
+
+ pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = IOMMU_L2_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(gfp);
+ if (!page)
+ return NULL;
+ *pte = IOMMU_L1_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+
+ if (pte_page)
+ *pte_page = pte;
+
+ pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+ return pte;
+}
+
+/*
+ * This function fetches the PTE for a given address in the aperture
+ */
+static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
+ unsigned long address)
+{
+ struct aperture_range *aperture;
+ u64 *pte, *pte_page;
+
+ aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+ if (!aperture)
+ return NULL;
+
+ pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+ if (!pte) {
+ pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+ aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
+ } else
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ return pte;
+}
+
+/*
* This is the generic map function. It maps one 4kb page at paddr to
* the given address in the DMA address space for the domain.
*/
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
paddr &= PAGE_MASK;
- pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
- pte += IOMMU_PTE_L0_INDEX(address);
+ pte = dma_ops_get_pte(dom, address);
+ if (!pte)
+ return bad_dma_address;
__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
struct dma_ops_domain *dom,
unsigned long address)
{
+ struct aperture_range *aperture;
u64 *pte;
if (address >= dom->aperture_size)
return;
- WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
+ aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+ if (!aperture)
+ return;
+
+ pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+ if (!pte)
+ return;
- pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
pte += IOMMU_PTE_L0_INDEX(address);
WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
u64 dma_mask)
{
dma_addr_t offset = paddr & ~PAGE_MASK;
- dma_addr_t address, start;
+ dma_addr_t address, start, ret;
unsigned int pages;
unsigned long align_mask = 0;
int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
if (align)
align_mask = (1UL << get_order(size)) - 1;
+retry:
address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
dma_mask);
- if (unlikely(address == bad_dma_address))
- goto out;
+ if (unlikely(address == bad_dma_address)) {
+ /*
+ * setting next_address here will let the address
+ * allocator only scan the new allocated range in the
+ * first run. This is a small optimization.
+ */
+ dma_dom->next_address = dma_dom->aperture_size;
+
+ if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+ goto out;
+
+ /*
+ * aperture was sucessfully enlarged by 128 MB, try
+ * allocation again
+ */
+ goto retry;
+ }
start = address;
for (i = 0; i < pages; ++i) {
- dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ if (ret == bad_dma_address)
+ goto out_unmap;
+
paddr += PAGE_SIZE;
start += PAGE_SIZE;
}
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
out:
return address;
+
+out_unmap:
+
+ for (--i; i >= 0; --i) {
+ start -= PAGE_SIZE;
+ dma_ops_domain_unmap(iommu, dma_dom, start);
+ }
+
+ dma_ops_free_addresses(dma_dom, address, pages);
+
+ return bad_dma_address;
}
/*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address)
+ if (*dma_addr == bad_dma_address) {
+ spin_unlock_irqrestore(&domain->lock, flags);
goto out_free;
+ }
iommu_completion_wait(iommu);
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
iommu = amd_iommu_rlookup_table[devid];
if (!iommu)
continue;
- dma_dom = dma_ops_domain_alloc(iommu, order);
+ dma_dom = dma_ops_domain_alloc(iommu);
if (!dma_dom)
continue;
init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
int __init amd_iommu_init_dma_ops(void)
{
struct amd_iommu *iommu;
- int order = amd_iommu_aperture_order;
int ret;
/*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
* found in the system. Devices not assigned to any other
* protection domain will be assigned to the default one.
*/
- list_for_each_entry(iommu, &amd_iommu_list, list) {
- iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+ for_each_iommu(iommu) {
+ iommu->default_dom = dma_ops_domain_alloc(iommu);
if (iommu->default_dom == NULL)
return -ENOMEM;
iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
free_domains:
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
if (iommu->default_dom)
dma_ops_domain_free(iommu->default_dom);
}
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
old_domain = domain_for_device(devid);
if (old_domain)
- return -EBUSY;
+ detach_device(old_domain, devid);
attach_device(iommu, domain, devid);
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902da..238989ec077 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,15 +115,21 @@ struct ivmd_header {
u64 range_length;
} __attribute__((packed));
+bool amd_iommu_dump;
+
static int __initdata amd_iommu_detected;
u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
-unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+#ifdef CONFIG_IOMMU_STRESS
+bool amd_iommu_isolate = false;
+#else
bool amd_iommu_isolate = true; /* if true, device isolation is
enabled */
+#endif
+
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
static inline unsigned long tbl_size(int entry_size)
{
unsigned shift = PAGE_SHIFT +
- get_order(amd_iommu_last_bdf * entry_size);
+ get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
return 1UL << shift;
}
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
* This function set the exclusion range in the IOMMU. DMA accesses to the
* exclusion range are passed through untranslated
*/
-static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
{
u64 start = iommu->exclusion_start & PAGE_MASK;
u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
}
/* Generic functions to enable/disable certain features of the IOMMU. */
-static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
{
u32 ctrl;
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
}
/* Function to enable the hardware */
-static void __init iommu_enable(struct amd_iommu *iommu)
+static void iommu_enable(struct amd_iommu *iommu)
{
printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
-/* Function to enable IOMMU event logging and event interrupts */
-static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void iommu_disable(struct amd_iommu *iommu)
{
- iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
- iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+ iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
}
/*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
{
u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(CMD_BUFFER_SIZE));
- u64 entry;
if (cmd_buf == NULL)
return NULL;
iommu->cmd_buf_size = CMD_BUFFER_SIZE;
- entry = (u64)virt_to_phys(cmd_buf);
+ return cmd_buf;
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->cmd_buf == NULL);
+
+ entry = (u64)virt_to_phys(iommu->cmd_buf);
entry |= MMIO_CMD_SIZE_512;
+
memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
- &entry, sizeof(entry));
+ &entry, sizeof(entry));
/* set head and tail to zero manually */
writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-
- return cmd_buf;
}
static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
/* allocates the memory where the IOMMU will log its events to */
static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
{
- u64 entry;
iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
get_order(EVT_BUFFER_SIZE));
if (iommu->evt_buf == NULL)
return NULL;
+ return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+ u64 entry;
+
+ BUG_ON(iommu->evt_buf == NULL);
+
entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
&entry, sizeof(entry));
- iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
- return iommu->evt_buf;
+ iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
}
static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
p += sizeof(struct ivhd_header);
end += h->length;
+
while (p < end) {
e = (struct ivhd_entry *)p;
switch (e->type) {
case IVHD_DEV_ALL:
+
+ DUMP_printk(" DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+ " last device %02x:%02x.%x flags: %02x\n",
+ PCI_BUS(iommu->first_device),
+ PCI_SLOT(iommu->first_device),
+ PCI_FUNC(iommu->first_device),
+ PCI_BUS(iommu->last_device),
+ PCI_SLOT(iommu->last_device),
+ PCI_FUNC(iommu->last_device),
+ e->flags);
+
for (dev_i = iommu->first_device;
dev_i <= iommu->last_device; ++dev_i)
set_dev_entry_from_acpi(iommu, dev_i,
e->flags, 0);
break;
case IVHD_DEV_SELECT:
+
+ DUMP_printk(" DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
devid = e->devid;
set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
break;
case IVHD_DEV_SELECT_RANGE_START:
+
+ DUMP_printk(" DEV_SELECT_RANGE_START\t "
+ "devid: %02x:%02x.%x flags: %02x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags);
+
devid_start = e->devid;
flags = e->flags;
ext_flags = 0;
alias = false;
break;
case IVHD_DEV_ALIAS:
+
+ DUMP_printk(" DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+ "flags: %02x devid_to: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
devid = e->devid;
devid_to = e->ext >> 8;
- set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+ set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
amd_iommu_alias_table[devid] = devid_to;
break;
case IVHD_DEV_ALIAS_RANGE:
+
+ DUMP_printk(" DEV_ALIAS_RANGE\t\t "
+ "devid: %02x:%02x.%x flags: %02x "
+ "devid_to: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags,
+ PCI_BUS(e->ext >> 8),
+ PCI_SLOT(e->ext >> 8),
+ PCI_FUNC(e->ext >> 8));
+
devid_start = e->devid;
flags = e->flags;
devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
alias = true;
break;
case IVHD_DEV_EXT_SELECT:
+
+ DUMP_printk(" DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+ "flags: %02x ext: %08x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
devid = e->devid;
set_dev_entry_from_acpi(iommu, devid, e->flags,
e->ext);
break;
case IVHD_DEV_EXT_SELECT_RANGE:
+
+ DUMP_printk(" DEV_EXT_SELECT_RANGE\t devid: "
+ "%02x:%02x.%x flags: %02x ext: %08x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid),
+ e->flags, e->ext);
+
devid_start = e->devid;
flags = e->flags;
ext_flags = e->ext;
alias = false;
break;
case IVHD_DEV_RANGE_END:
+
+ DUMP_printk(" DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+ PCI_BUS(e->devid),
+ PCI_SLOT(e->devid),
+ PCI_FUNC(e->devid));
+
devid = e->devid;
for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
{
struct amd_iommu *iommu, *next;
- list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+ for_each_iommu_safe(iommu, next) {
list_del(&iommu->list);
free_iommu_one(iommu);
kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
if (!iommu->mmio_base)
return -ENOMEM;
- iommu_set_device_table(iommu);
iommu->cmd_buf = alloc_command_buffer(iommu);
if (!iommu->cmd_buf)
return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
h = (struct ivhd_header *)p;
switch (*p) {
case ACPI_IVHD_TYPE:
+
+ DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+ "seg: %d flags: %01x info %04x\n",
+ PCI_BUS(h->devid), PCI_SLOT(h->devid),
+ PCI_FUNC(h->devid), h->cap_ptr,
+ h->pci_seg, h->flags, h->info);
+ DUMP_printk(" mmio-addr: %016llx\n",
+ h->mmio_phys);
+
iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
if (iommu == NULL)
return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
*
****************************************************************************/
-static int __init iommu_setup_msix(struct amd_iommu *iommu)
-{
- struct amd_iommu *curr;
- struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
- int nvec = 0, i;
-
- list_for_each_entry(curr, &amd_iommu_list, list) {
- if (curr->dev == iommu->dev) {
- entries[nvec].entry = curr->evt_msi_num;
- entries[nvec].vector = 0;
- curr->int_enabled = true;
- nvec++;
- }
- }
-
- if (pci_enable_msix(iommu->dev, entries, nvec)) {
- pci_disable_msix(iommu->dev);
- return 1;
- }
-
- for (i = 0; i < nvec; ++i) {
- int r = request_irq(entries->vector, amd_iommu_int_handler,
- IRQF_SAMPLE_RANDOM,
- "AMD IOMMU",
- NULL);
- if (r)
- goto out_free;
- }
-
- return 0;
-
-out_free:
- for (i -= 1; i >= 0; --i)
- free_irq(entries->vector, NULL);
-
- pci_disable_msix(iommu->dev);
-
- return 1;
-}
-
static int __init iommu_setup_msi(struct amd_iommu *iommu)
{
int r;
- struct amd_iommu *curr;
-
- list_for_each_entry(curr, &amd_iommu_list, list) {
- if (curr->dev == iommu->dev)
- curr->int_enabled = true;
- }
-
if (pci_enable_msi(iommu->dev))
return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
return 1;
}
+ iommu->int_enabled = true;
+ iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
return 0;
}
-static int __init iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_msi(struct amd_iommu *iommu)
{
if (iommu->int_enabled)
return 0;
- if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
- return iommu_setup_msix(iommu);
- else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+ if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
return iommu_setup_msi(iommu);
return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
static int __init init_unity_map_range(struct ivmd_header *m)
{
struct unity_map_entry *e = 0;
+ char *s;
e = kzalloc(sizeof(*e), GFP_KERNEL);
if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
switch (m->type) {
default:
+ kfree(e);
+ return 0;
case ACPI_IVMD_TYPE:
+ s = "IVMD_TYPEi\t\t\t";
e->devid_start = e->devid_end = m->devid;
break;
case ACPI_IVMD_TYPE_ALL:
+ s = "IVMD_TYPE_ALL\t\t";
e->devid_start = 0;
e->devid_end = amd_iommu_last_bdf;
break;
case ACPI_IVMD_TYPE_RANGE:
+ s = "IVMD_TYPE_RANGE\t\t";
e->devid_start = m->devid;
e->devid_end = m->aux;
break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
e->prot = m->flags >> 1;
+ DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+ " range_start: %016llx range_end: %016llx flags: %x\n", s,
+ PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+ PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+ PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+ e->address_start, e->address_end, m->flags);
+
list_add_tail(&e->list, &amd_iommu_unity_map);
return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
* This function finally enables all IOMMUs found in the system after
* they have been initialized
*/
-static void __init enable_iommus(void)
+static void enable_iommus(void)
{
struct amd_iommu *iommu;
- list_for_each_entry(iommu, &amd_iommu_list, list) {
+ for_each_iommu(iommu) {
+ iommu_set_device_table(iommu);
+ iommu_enable_command_buffer(iommu);
+ iommu_enable_event_buffer(iommu);
iommu_set_exclusion_range(iommu);
iommu_init_msi(iommu);
- iommu_enable_event_logging(iommu);
iommu_enable(iommu);
}
}
+static void disable_iommus(void)
+{
+ struct amd_iommu *iommu;
+
+ for_each_iommu(iommu)
+ iommu_disable(iommu);
+}
+
/*
* Suspend/Resume support
* disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
static int amd_iommu_resume(struct sys_device *dev)
{
+ /*
+ * Disable IOMMUs before reprogramming the hardware registers.
+ * IOMMU is still enabled from the resume kernel.
+ */
+ disable_iommus();
+
+ /* re-load the hardware */
+ enable_iommus();
+
+ /*
+ * we have to flush after the IOMMUs are enabled because a
+ * disabled IOMMU will never execute the commands we send
+ */
+ amd_iommu_flush_all_domains();
+ amd_iommu_flush_all_devices();
+
return 0;
}
static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
{
- return -EINVAL;
+ /* disable IOMMUs to go out of the way for BIOS */
+ disable_iommus();
+
+ return 0;
}
static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
enable_iommus();
- printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
- (1 << (amd_iommu_aperture_order-20)));
-
printk(KERN_INFO "AMD IOMMU: device isolation ");
if (amd_iommu_isolate)
printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
*
****************************************************************************/
+static int __init parse_amd_iommu_dump(char *str)
+{
+ amd_iommu_dump = true;
+
+ return 1;
+}
+
static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
return 1;
}
-static int __init parse_amd_iommu_size_options(char *str)
-{
- unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
-
- if ((order > 24) && (order < 31))
- amd_iommu_aperture_order = order;
-
- return 1;
-}
-
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
__setup("amd_iommu=", parse_amd_iommu_options);
-__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f24..076d3881f3d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
* Mikael Pettersson : PM converted to driver model.
*/
+#include <linux/perf_counter.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
@@ -34,6 +35,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
+#include <asm/perf_counter.h>
#include <asm/pgalloc.h>
#include <asm/atomic.h>
#include <asm/mpspec.h>
@@ -98,6 +100,29 @@ early_param("lapic", parse_lapic);
/* Local APIC was disabled by the BIOS and enabled by the kernel */
static int enabled_via_apicbase;
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline void imcr_pic_to_apic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go through APIC */
+ outb(0x01, 0x23);
+}
+
+static inline void imcr_apic_to_pic(void)
+{
+ /* select IMCR register */
+ outb(0x70, 0x22);
+ /* NMI and 8259 INTR go directly to BSP */
+ outb(0x00, 0x23);
+}
#endif
#ifdef CONFIG_X86_64
@@ -111,13 +136,19 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
+int x2apic_mode;
#ifdef CONFIG_X86_X2APIC
-int x2apic;
/* x2apic enabled before OS handover */
static int x2apic_preenabled;
static int disable_x2apic;
static __init int setup_nox2apic(char *str)
{
+ if (x2apic_enabled()) {
+ pr_warning("Bios already enabled x2apic, "
+ "can't enforce nox2apic");
+ return 0;
+ }
+
disable_x2apic = 1;
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
return 0;
@@ -209,6 +240,31 @@ static int modern_apic(void)
return lapic_get_version() >= 0x14;
}
+/*
+ * bare function to substitute write operation
+ * and it's _that_ fast :)
+ */
+static void native_apic_write_dummy(u32 reg, u32 v)
+{
+ WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+}
+
+static u32 native_apic_read_dummy(u32 reg)
+{
+ WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+ return 0;
+}
+
+/*
+ * right after this call apic->write/read doesn't do anything
+ * note that there is no restore operation it works one way
+ */
+void apic_disable(void)
+{
+ apic->read = native_apic_read_dummy;
+ apic->write = native_apic_write_dummy;
+}
+
void native_apic_wait_icr_idle(void)
{
while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +404,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
{
- unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+ unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
unsigned int v = (mask << 16) | (msg_type << 8) | vector;
apic_write(reg, v);
@@ -815,7 +871,7 @@ void clear_local_APIC(void)
u32 v;
/* APIC hasn't been mapped yet */
- if (!x2apic && !apic_phys)
+ if (!x2apic_mode && !apic_phys)
return;
maxlvt = lapic_get_maxlvt();
@@ -1133,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
+ perf_counters_lapic_init();
preempt_disable();
@@ -1287,7 +1344,7 @@ void check_x2apic(void)
{
if (x2apic_enabled()) {
pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic = 1;
+ x2apic_preenabled = x2apic_mode = 1;
}
}
@@ -1295,7 +1352,7 @@ void enable_x2apic(void)
{
int msr, msr2;
- if (!x2apic)
+ if (!x2apic_mode)
return;
rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1361,7 @@ void enable_x2apic(void)
wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
}
}
+#endif /* CONFIG_X86_X2APIC */
void __init enable_IR_x2apic(void)
{
@@ -1312,32 +1370,21 @@ void __init enable_IR_x2apic(void)
unsigned long flags;
struct IO_APIC_route_entry **ioapic_entries = NULL;
- if (!cpu_has_x2apic)
- return;
-
- if (!x2apic_preenabled && disable_x2apic) {
- pr_info("Skipped enabling x2apic and Interrupt-remapping "
- "because of nox2apic\n");
- return;
+ ret = dmar_table_init();
+ if (ret) {
+ pr_debug("dmar_table_init() failed with %d:\n", ret);
+ goto ir_failed;
}
- if (x2apic_preenabled && disable_x2apic)
- panic("Bios already enabled x2apic, can't enforce nox2apic");
-
- if (!x2apic_preenabled && skip_ioapic_setup) {
- pr_info("Skipped enabling x2apic and Interrupt-remapping "
- "because of skipping io-apic setup\n");
- return;
+ if (!intr_remapping_supported()) {
+ pr_debug("intr-remapping not supported\n");
+ goto ir_failed;
}
- ret = dmar_table_init();
- if (ret) {
- pr_info("dmar_table_init() failed with %d:\n", ret);
- if (x2apic_preenabled)
- panic("x2apic enabled by bios. But IR enabling failed");
- else
- pr_info("Not enabling x2apic,Intr-remapping\n");
+ if (!x2apic_preenabled && skip_ioapic_setup) {
+ pr_info("Skipped enabling intr-remap because of skipping "
+ "io-apic setup\n");
return;
}
@@ -1357,19 +1404,16 @@ void __init enable_IR_x2apic(void)
mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
- ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-
- if (ret && x2apic_preenabled) {
- local_irq_restore(flags);
- panic("x2apic enabled by bios. But IR enabling failed");
- }
-
+ ret = enable_intr_remapping(x2apic_supported());
if (ret)
goto end_restore;
- if (!x2apic) {
- x2apic = 1;
+ pr_info("Enabled Interrupt-remapping\n");
+
+ if (x2apic_supported() && !x2apic_mode) {
+ x2apic_mode = 1;
enable_x2apic();
+ pr_info("Enabled x2apic\n");
}
end_restore:
@@ -1378,37 +1422,34 @@ end_restore:
* IR enabling failed
*/
restore_IO_APIC_setup(ioapic_entries);
- else
- reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
unmask_8259A();
local_irq_restore(flags);
end:
- if (!ret) {
- if (!x2apic_preenabled)
- pr_info("Enabled x2apic and interrupt-remapping\n");
- else
- pr_info("Enabled Interrupt-remapping\n");
- } else
- pr_err("Failed to enable Interrupt-remapping and x2apic\n");
if (ioapic_entries)
free_ioapic_entries(ioapic_entries);
+
+ if (!ret)
+ return;
+
+ir_failed:
+ if (x2apic_preenabled)
+ panic("x2apic enabled by bios. But IR enabling failed");
+ else if (cpu_has_x2apic)
+ pr_info("Not enabling x2apic,Intr-remapping\n");
#else
if (!cpu_has_x2apic)
return;
if (x2apic_preenabled)
panic("x2apic enabled prior OS handover,"
- " enable CONFIG_INTR_REMAP");
-
- pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
- " and x2apic\n");
+ " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
#endif
return;
}
-#endif /* CONFIG_X86_X2APIC */
+
#ifdef CONFIG_X86_64
/*
@@ -1425,7 +1466,6 @@ static int __init detect_init_APIC(void)
}
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_physical_apicid = 0;
return 0;
}
#else
@@ -1539,32 +1579,49 @@ void __init early_init_lapic_mapping(void)
*/
void __init init_apic_mappings(void)
{
- if (x2apic) {
+ unsigned int new_apicid;
+
+ if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
return;
}
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
+ /* If no local APIC can be found return early */
if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
+ /* lets NOP'ify apic operations */
+ pr_info("APIC: disable apic facility\n");
+ apic_disable();
+ } else {
apic_phys = mp_lapic_addr;
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
- APIC_BASE, apic_phys);
+ /*
+ * acpi lapic path already maps that address in
+ * acpi_register_lapic_address()
+ */
+ if (!acpi_lapic)
+ set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+
+ apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+ APIC_BASE, apic_phys);
+ }
/*
* Fetch the APIC ID of the BSP in case we have a
* default configuration (or the MP table is broken).
*/
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = read_apic_id();
+ new_apicid = read_apic_id();
+ if (boot_cpu_physical_apicid != new_apicid) {
+ boot_cpu_physical_apicid = new_apicid;
+ /*
+ * yeah -- we lie about apic_version
+ * in case if apic was disabled via boot option
+ * but it's not a problem for SMP compiled kernel
+ * since smp_sanity_check is prepared for such a case
+ * and disable smp mode
+ */
+ apic_version[new_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
}
/*
@@ -1733,8 +1790,7 @@ void __init connect_bsp_APIC(void)
*/
apic_printk(APIC_VERBOSE, "leaving PIC mode, "
"enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
+ imcr_pic_to_apic();
}
#endif
if (apic->enable_apic_mode)
@@ -1762,8 +1818,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
*/
apic_printk(APIC_VERBOSE, "disabling APIC mode, "
"entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
+ imcr_apic_to_pic();
return;
}
#endif
@@ -1969,10 +2024,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
local_irq_save(flags);
disable_local_APIC();
-#ifdef CONFIG_INTR_REMAP
+
if (intr_remapping_enabled)
disable_intr_remapping();
-#endif
+
local_irq_restore(flags);
return 0;
}
@@ -1982,42 +2037,34 @@ static int lapic_resume(struct sys_device *dev)
unsigned int l, h;
unsigned long flags;
int maxlvt;
-
-#ifdef CONFIG_INTR_REMAP
- int ret;
+ int ret = 0;
struct IO_APIC_route_entry **ioapic_entries = NULL;
if (!apic_pm_state.active)
return 0;
local_irq_save(flags);
- if (x2apic) {
+ if (intr_remapping_enabled) {
ioapic_entries = alloc_ioapic_entries();
if (!ioapic_entries) {
WARN(1, "Alloc ioapic_entries in lapic resume failed.");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto restore;
}
ret = save_IO_APIC_setup(ioapic_entries);
if (ret) {
WARN(1, "Saving IO-APIC state failed: %d\n", ret);
free_ioapic_entries(ioapic_entries);
- return ret;
+ goto restore;
}
mask_IO_APIC_setup(ioapic_entries);
mask_8259A();
- enable_x2apic();
}
-#else
- if (!apic_pm_state.active)
- return 0;
- local_irq_save(flags);
- if (x2apic)
+ if (x2apic_mode)
enable_x2apic();
-#endif
-
else {
/*
* Make sure the APICBASE points to the right address
@@ -2055,21 +2102,16 @@ static int lapic_resume(struct sys_device *dev)
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
-#ifdef CONFIG_INTR_REMAP
- if (intr_remapping_enabled)
- reenable_intr_remapping(EIM_32BIT_APIC_ID);
-
- if (x2apic) {
+ if (intr_remapping_enabled) {
+ reenable_intr_remapping(x2apic_mode);
unmask_8259A();
restore_IO_APIC_setup(ioapic_entries);
free_ioapic_entries(ioapic_entries);
}
-#endif
-
+restore:
local_irq_restore(flags);
-
- return 0;
+ return ret;
}
/*
@@ -2117,31 +2159,14 @@ static void apic_pm_activate(void) { }
#endif /* CONFIG_PM */
#ifdef CONFIG_X86_64
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
+
+static int __cpuinit apic_cluster_num(void)
{
int i, clusters, zeros;
unsigned id;
u16 *bios_cpu_apicid;
DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
- /*
- * there is not this kind of box with AMD CPU yet.
- * Some AMD box with quadcore cpu and 8 sockets apicid
- * will be [4, 0x23] or [8, 0x27] could be thought to
- * vsmp box still need checking...
- */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
- return 0;
-
bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
@@ -2177,18 +2202,67 @@ __cpuinit int apic_is_clustered_box(void)
++zeros;
}
- /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (is_vsmp_box() && clusters > 1)
+ return clusters;
+}
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+ if (multi)
+ return 0;
+ pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+ multi = 1;
+ return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+ {
+ .callback = set_multi,
+ .ident = "IBM System Summit2",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+ },
+ },
+ {}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+ if (multi_checked)
+ return;
+
+ dmi_check_system(multi_dmi_table);
+ multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+ dmi_check_multi();
+ if (multi)
return 1;
+ if (!is_vsmp_box())
+ return 0;
+
/*
- * If clusters > 2, then should be multi-chassis.
- * May have to revisit this when multi-core + hyperthreaded CPUs come
- * out, but AFAIK this will work even for them.
+ * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
*/
- return (clusters > 2);
+ if (apic_cluster_num() > 1)
+ return 1;
+
+ return 0;
}
#endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6..d0c99abc26c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
{
- return hard_smp_processor_id() >> index_msb;
+ return initial_apic_id >> index_msb;
}
struct apic apic_flat = {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
* regardless of how many processors are present (x86_64 ES7000
* is an example).
*/
- if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "system APIC only can use physical flat");
return 1;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 30294777557..69328ac8de9 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
return gsi;
}
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
{
unsigned long vect = 0, psaival = 0;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e..1946fac42ab 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
#include <asm/setup.h>
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
+#include <asm/hw_irq.h>
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_irq.h>
@@ -129,12 +130,9 @@ struct irq_pin_list {
struct irq_pin_list *next;
};
-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+static struct irq_pin_list *get_one_free_irq_2_pin(int node)
{
struct irq_pin_list *pin;
- int node;
-
- node = cpu_to_node(cpu);
pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
@@ -148,9 +146,6 @@ struct irq_cfg {
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- u8 move_desc_pending : 1;
-#endif
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -212,12 +207,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
return cfg;
}
-static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+static struct irq_cfg *get_one_free_irq_cfg(int node)
{
struct irq_cfg *cfg;
- int node;
-
- node = cpu_to_node(cpu);
cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
if (cfg) {
@@ -238,13 +230,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
return cfg;
}
-int arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int node)
{
struct irq_cfg *cfg;
cfg = desc->chip_data;
if (!cfg) {
- desc->chip_data = get_one_free_irq_cfg(cpu);
+ desc->chip_data = get_one_free_irq_cfg(node);
if (!desc->chip_data) {
printk(KERN_ERR "can not alloc irq_cfg\n");
BUG_ON(1);
@@ -254,10 +246,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
return 0;
}
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
+/* for move_irq_desc */
static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
{
struct irq_pin_list *old_entry, *head, *tail, *entry;
@@ -266,7 +257,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
if (!old_entry)
return;
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry)
return;
@@ -276,7 +267,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
tail = entry;
old_entry = old_entry->next;
while (old_entry) {
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry) {
entry = head;
while (entry) {
@@ -316,12 +307,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
}
void arch_init_copy_chip_data(struct irq_desc *old_desc,
- struct irq_desc *desc, int cpu)
+ struct irq_desc *desc, int node)
{
struct irq_cfg *cfg;
struct irq_cfg *old_cfg;
- cfg = get_one_free_irq_cfg(cpu);
+ cfg = get_one_free_irq_cfg(node);
if (!cfg)
return;
@@ -332,7 +323,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
- init_copy_irq_2_pin(old_cfg, cfg, cpu);
+ init_copy_irq_2_pin(old_cfg, cfg, node);
}
static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +347,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
old_desc->chip_data = NULL;
}
}
-
-static void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg = desc->chip_data;
-
- if (!cfg->move_in_progress) {
- /* it means that domain is not changed */
- if (!cpumask_intersects(desc->affinity, mask))
- cfg->move_desc_pending = 1;
- }
-}
-#endif
+/* end for move_irq_desc */
#else
static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +357,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
#endif
-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-}
-#endif
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -518,132 +490,18 @@ static void ioapic_mask_entry(int apic, int pin)
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
- cfg->move_cleanup_count = 0;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- cfg->move_cleanup_count++;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
- cfg->move_in_progress = 0;
-}
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
- u8 vector = cfg->vector;
-
- entry = cfg->irq_2_pin;
- for (;;) {
- unsigned int reg;
-
- if (!entry)
- break;
-
- apic = entry->apic;
- pin = entry->pin;
- /*
- * With interrupt-remapping, destination information comes
- * from interrupt-remapping table entry.
- */
- if (!irq_remapped(irq))
- io_apic_write(apic, 0x11 + pin*2, dest);
- reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~IO_APIC_REDIR_VECTOR_MASK;
- reg |= vector;
- io_apic_modify(apic, 0x10 + pin*2, reg);
- if (!entry->next)
- break;
- entry = entry->next;
- }
-}
-
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- unsigned int irq;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return BAD_APICID;
-
- irq = desc->irq;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return BAD_APICID;
-
- /* check that before desc->addinity get updated */
- set_extra_move_desc(desc, mask);
-
- cpumask_copy(desc->affinity, mask);
-
- return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int dest;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- dest = set_desc_affinity(desc, mask);
- if (dest != BAD_APICID) {
- /* Only the high 8 bits are valid. */
- dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, cfg);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
- struct irq_desc *desc;
-
- desc = irq_to_desc(irq);
-
- set_ioapic_affinity_irq_desc(desc, mask);
-}
-#endif /* CONFIG_SMP */
-
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
struct irq_pin_list *entry;
entry = cfg->irq_2_pin;
if (!entry) {
- entry = get_one_free_irq_2_pin(cpu);
+ entry = get_one_free_irq_2_pin(node);
if (!entry) {
printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
apic, pin);
@@ -663,7 +521,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
entry = entry->next;
}
- entry->next = get_one_free_irq_2_pin(cpu);
+ entry->next = get_one_free_irq_2_pin(node);
entry = entry->next;
entry->apic = apic;
entry->pin = pin;
@@ -672,7 +530,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
/*
* Reroute an IRQ to a different pin.
*/
-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
int oldapic, int oldpin,
int newapic, int newpin)
{
@@ -692,7 +550,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
/* why? call replace before add? */
if (!replaced)
- add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+ add_pin_to_irq_node(cfg, node, newapic, newpin);
}
static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +708,6 @@ static int __init ioapic_pirq_setup(char *str)
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_INTR_REMAP
struct IO_APIC_route_entry **alloc_ioapic_entries(void)
{
int apic;
@@ -948,20 +805,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
return 0;
}
-void reinit_intr_remapped_IO_APIC(int intr_remapping,
- struct IO_APIC_route_entry **ioapic_entries)
-
-{
- /*
- * for now plain restore of previous settings.
- * TBD: In the case of OS enabling interrupt-remapping,
- * IO-APIC RTE's need to be setup to point to interrupt-remapping
- * table entries. for now, do a plain restore, and wait for
- * the setup_IO_APIC_irqs() to do proper initialization.
- */
- restore_IO_APIC_setup(ioapic_entries);
-}
-
void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
{
int apic;
@@ -971,7 +814,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
kfree(ioapic_entries);
}
-#endif
/*
* Find the IRQ entry number of a certain pin.
@@ -1032,54 +874,6 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
- int apic, i, best_guess = -1;
-
- apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
- if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
- return -1;
- }
- for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].srcbus;
-
- for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
- mp_irqs[i].dstapic == MP_APIC_ALL)
- break;
-
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
-
- if (!(apic || IO_APIC_IRQ(irq)))
- continue;
-
- if (pin == (mp_irqs[i].srcbusirq & 3))
- return irq;
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0)
- best_guess = irq;
- }
- }
- return best_guess;
-}
-
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
/*
* EISA Edge/Level control register, ELCR
@@ -1298,6 +1092,64 @@ static int pin_2_irq(int idx, int apic, int pin)
return irq;
}
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int apic, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG,
+ "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (test_bit(bus, mp_bus_not_pci)) {
+ apic_printk(APIC_VERBOSE,
+ "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].srcbus;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+ mp_irqs[i].dstapic == MP_APIC_ALL)
+ break;
+
+ if (!test_bit(lbus, mp_bus_not_pci) &&
+ !mp_irqs[i].irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+
+ if (!(apic || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ set_io_apic_irq_attr(irq_attr, apic,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ return irq;
+ }
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0) {
+ set_io_apic_irq_attr(irq_attr, apic,
+ mp_irqs[i].dstirq,
+ irq_trigger(i),
+ irq_polarity(i));
+ best_guess = irq;
+ }
+ }
+ }
+ return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
void lock_vector_lock(void)
{
/* Used to the online set of cpus does not change
@@ -1628,58 +1480,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
ioapic_write_entry(apic_id, pin, entry);
}
+static struct {
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
static void __init setup_IO_APIC_irqs(void)
{
- int apic_id, pin, idx, irq;
+ int apic_id = 0, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
struct irq_cfg *cfg;
- int cpu = boot_cpu_id;
+ int node = cpu_to_node(boot_cpu_id);
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
- for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
-
- idx = find_irq_entry(apic_id, pin, mp_INT);
- if (idx == -1) {
- if (!notcon) {
- notcon = 1;
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- } else
- apic_printk(APIC_VERBOSE, " %d-%d",
- mp_ioapics[apic_id].apicid, pin);
- continue;
- }
- if (notcon) {
- apic_printk(APIC_VERBOSE,
- " (apicid-pin) not connected\n");
- notcon = 0;
- }
+#ifdef CONFIG_ACPI
+ if (!acpi_disabled && acpi_ioapic) {
+ apic_id = mp_find_ioapic(0);
+ if (apic_id < 0)
+ apic_id = 0;
+ }
+#endif
- irq = pin_2_irq(idx, apic_id, pin);
+ for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+ idx = find_irq_entry(apic_id, pin, mp_INT);
+ if (idx == -1) {
+ if (!notcon) {
+ notcon = 1;
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " %d-%d",
+ mp_ioapics[apic_id].apicid, pin);
+ } else
+ apic_printk(APIC_VERBOSE, " %d-%d",
+ mp_ioapics[apic_id].apicid, pin);
+ continue;
+ }
+ if (notcon) {
+ apic_printk(APIC_VERBOSE,
+ " (apicid-pin) not connected\n");
+ notcon = 0;
+ }
- /*
- * Skip the timer IRQ if there's a quirk handler
- * installed and if it returns 1:
- */
- if (apic->multi_timer_check &&
- apic->multi_timer_check(apic_id, irq))
- continue;
+ irq = pin_2_irq(idx, apic_id, pin);
- desc = irq_to_desc_alloc_cpu(irq, cpu);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc for %d\n", irq);
- continue;
- }
- cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+ /*
+ * Skip the timer IRQ if there's a quirk handler
+ * installed and if it returns 1:
+ */
+ if (apic->multi_timer_check &&
+ apic->multi_timer_check(apic_id, irq))
+ continue;
- setup_IO_APIC_irq(apic_id, pin, irq, desc,
- irq_trigger(idx), irq_polarity(idx));
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+ continue;
}
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, apic_id, pin);
+ /*
+ * don't mark it in pin_programmed, so later acpi could
+ * set it correctly when irq < 16
+ */
+ setup_IO_APIC_irq(apic_id, pin, irq, desc,
+ irq_trigger(idx), irq_polarity(idx));
}
if (notcon)
@@ -1869,7 +1733,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
__apicdebuginit(void) print_local_APIC(void *dummy)
{
- unsigned int v, ver, maxlvt;
+ unsigned int i, v, ver, maxlvt;
u64 icr;
if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1821,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
v = apic_read(APIC_TDCR);
printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
printk("\n");
}
@@ -2005,6 +1881,11 @@ __apicdebuginit(void) print_PIC(void)
__apicdebuginit(int) print_all_ICs(void)
{
print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!cpu_has_apic || disable_apic)
+ return 0;
+
print_all_local_APICs();
print_IO_APIC();
@@ -2360,6 +2241,118 @@ static int ioapic_retrigger_irq(unsigned int irq)
*/
#ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ cfg->move_cleanup_count = 0;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ cfg->move_cleanup_count++;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+ u8 vector = cfg->vector;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+ unsigned int reg;
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ /*
+ * With interrupt-remapping, destination information comes
+ * from interrupt-remapping table entry.
+ */
+ if (!irq_remapped(irq))
+ io_apic_write(apic, 0x11 + pin*2, dest);
+ reg = io_apic_read(apic, 0x10 + pin*2);
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+ reg |= vector;
+ io_apic_modify(apic, 0x10 + pin*2, reg);
+ if (!entry->next)
+ break;
+ entry = entry->next;
+ }
+}
+
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned int irq;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return BAD_APICID;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ cpumask_copy(desc->affinity, mask);
+
+ return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+}
+
+static int
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int dest;
+ unsigned int irq;
+ int ret = -1;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ dest = set_desc_affinity(desc, mask);
+ if (dest != BAD_APICID) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, cfg);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return ret;
+}
+
+static int
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+ struct irq_desc *desc;
+
+ desc = irq_to_desc(irq);
+
+ return set_ioapic_affinity_irq_desc(desc, mask);
+}
#ifdef CONFIG_INTR_REMAP
@@ -2374,26 +2367,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
* Real vector that is used for interrupting cpu will be coming from
* the interrupt-remapping table entry.
*/
-static void
+static int
migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
struct irte irte;
unsigned int dest;
unsigned int irq;
+ int ret = -1;
if (!cpumask_intersects(mask, cpu_online_mask))
- return;
+ return ret;
irq = desc->irq;
if (get_irte(irq, &irte))
- return;
+ return ret;
cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return;
-
- set_extra_move_desc(desc, mask);
+ return ret;
dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
@@ -2409,27 +2401,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
send_cleanup_vector(cfg);
cpumask_copy(desc->affinity, mask);
+
+ return 0;
}
/*
* Migrates the IRQ destination in the process context.
*/
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
- migrate_ioapic_irq_desc(desc, mask);
+ return migrate_ioapic_irq_desc(desc, mask);
}
-static void set_ir_ioapic_affinity_irq(unsigned int irq,
+static int set_ir_ioapic_affinity_irq(unsigned int irq,
const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
- set_ir_ioapic_affinity_irq_desc(desc, mask);
+ return set_ir_ioapic_affinity_irq_desc(desc, mask);
}
#else
-static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
const struct cpumask *mask)
{
+ return 0;
}
#endif
@@ -2491,86 +2486,19 @@ static void irq_complete_move(struct irq_desc **descp)
struct irq_cfg *cfg = desc->chip_data;
unsigned vector, me;
- if (likely(!cfg->move_in_progress)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- if (likely(!cfg->move_desc_pending))
- return;
-
- /* domain has not changed, but affinity did */
- me = smp_processor_id();
- if (cpumask_test_cpu(me, desc->affinity)) {
- *descp = desc = move_irq_desc(desc, me);
- /* get the new one */
- cfg = desc->chip_data;
- cfg->move_desc_pending = 0;
- }
-#endif
+ if (likely(!cfg->move_in_progress))
return;
- }
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
- *descp = desc = move_irq_desc(desc, me);
- /* get the new one */
- cfg = desc->chip_data;
-#endif
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
send_cleanup_vector(cfg);
- }
}
#else
static inline void irq_complete_move(struct irq_desc **descp) {}
#endif
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
- int apic, pin;
- struct irq_pin_list *entry;
-
- entry = cfg->irq_2_pin;
- for (;;) {
-
- if (!entry)
- break;
-
- apic = entry->apic;
- pin = entry->pin;
- io_apic_eoi(apic, pin);
- entry = entry->next;
- }
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int irq;
-
- irq = desc->irq;
- cfg = desc->chip_data;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- __eoi_ioapic_irq(irq, cfg);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#ifdef CONFIG_X86_X2APIC
-static void ack_x2apic_level(unsigned int irq)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- ack_x2APIC_irq();
- eoi_ioapic_irq(desc);
-}
-
-static void ack_x2apic_edge(unsigned int irq)
-{
- ack_x2APIC_irq();
-}
-#endif
-
static void ack_apic_edge(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2562,6 @@ static void ack_apic_level(unsigned int irq)
*/
ack_APIC_irq();
- if (irq_remapped(irq))
- eoi_ioapic_irq(desc);
-
/* Now we can move and renable the irq */
if (unlikely(do_unmask_irq)) {
/* Only migrate the irq if the ack has been received.
@@ -2683,22 +2608,50 @@ static void ack_apic_level(unsigned int irq)
}
#ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+ int apic, pin;
+ struct irq_pin_list *entry;
+
+ entry = cfg->irq_2_pin;
+ for (;;) {
+
+ if (!entry)
+ break;
+
+ apic = entry->apic;
+ pin = entry->pin;
+ io_apic_eoi(apic, pin);
+ entry = entry->next;
+ }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __eoi_ioapic_irq(irq, cfg);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
static void ir_ack_apic_edge(unsigned int irq)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_enabled())
- return ack_x2apic_edge(irq);
-#endif
- return ack_apic_edge(irq);
+ ack_APIC_irq();
}
static void ir_ack_apic_level(unsigned int irq)
{
-#ifdef CONFIG_X86_X2APIC
- if (x2apic_enabled())
- return ack_x2apic_level(irq);
-#endif
- return ack_apic_level(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ ack_APIC_irq();
+ eoi_ioapic_irq(desc);
}
#endif /* CONFIG_INTR_REMAP */
@@ -2903,7 +2856,7 @@ static inline void __init check_timer(void)
{
struct irq_desc *desc = irq_to_desc(0);
struct irq_cfg *cfg = desc->chip_data;
- int cpu = boot_cpu_id;
+ int node = cpu_to_node(boot_cpu_id);
int apic1, pin1, apic2, pin2;
unsigned long flags;
int no_pin1 = 0;
@@ -2969,7 +2922,7 @@ static inline void __init check_timer(void)
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
- add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+ add_pin_to_irq_node(cfg, node, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
} else {
/* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2959,7 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
- replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
enable_8259A_irq(0);
if (timer_irq_works()) {
@@ -3218,14 +3171,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
*/
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, int node)
{
/* Allocate an unused irq */
unsigned int irq;
unsigned int new;
unsigned long flags;
struct irq_cfg *cfg_new = NULL;
- int cpu = boot_cpu_id;
struct irq_desc *desc_new = NULL;
irq = 0;
@@ -3234,7 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
- desc_new = irq_to_desc_alloc_cpu(new, cpu);
+ desc_new = irq_to_desc_alloc_node(new, node);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
continue;
@@ -3243,6 +3195,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
if (cfg_new->vector != 0)
continue;
+
+ desc_new = move_irq_desc(desc_new, node);
+
if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
break;
@@ -3260,11 +3215,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
int create_irq(void)
{
+ int node = cpu_to_node(boot_cpu_id);
unsigned int irq_want;
int irq;
irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
+ irq = create_irq_nr(irq_want, node);
if (irq == 0)
irq = -1;
@@ -3366,7 +3322,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
}
#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3375,7 +3331,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3387,13 +3343,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg_desc(desc, &msg);
+
+ return 0;
}
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
* done in the process context using interrupt-remapping hardware.
*/
-static void
+static int
ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3360,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
struct irte irte;
if (get_irte(irq, &irte))
- return;
+ return -1;
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3381,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
*/
if (cfg->move_in_progress)
send_cleanup_vector(cfg);
+
+ return 0;
}
#endif
@@ -3518,15 +3478,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
unsigned int irq_want;
struct intel_iommu *iommu = NULL;
int index = 0;
+ int node;
/* x86 doesn't support multiple MSI yet */
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
+ node = dev_to_node(&dev->dev);
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want);
+ irq = create_irq_nr(irq_want, node);
if (irq == 0)
return -1;
irq_want = irq + 1;
@@ -3576,7 +3538,7 @@ void arch_teardown_msi_irq(unsigned int irq)
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
#ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3585,7 +3547,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3597,6 +3559,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
+
+ return 0;
}
#endif /* CONFIG_SMP */
@@ -3630,7 +3594,7 @@ int arch_setup_dmar_msi(unsigned int irq)
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3639,7 +3603,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
@@ -3651,6 +3615,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
hpet_msi_write(irq, &msg);
+
+ return 0;
}
#endif /* CONFIG_SMP */
@@ -3707,7 +3673,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
write_ht_irq_msg(irq, &msg);
}
-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
@@ -3715,11 +3681,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
dest = set_desc_affinity(desc, mask);
if (dest == BAD_APICID)
- return;
+ return -1;
cfg = desc->chip_data;
target_ht_irq(irq, dest, cfg->vector);
+
+ return 0;
}
#endif
@@ -3794,6 +3762,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long flags;
int err;
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
cfg = irq_cfg(irq);
err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3777,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
- entry->vector = cfg->vector;
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->polarity = 0;
- entry->trigger = 0;
- entry->mask = 0;
- entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+ entry->vector = cfg->vector;
+ entry->delivery_mode = apic->irq_delivery_mode;
+ entry->dest_mode = apic->irq_dest_mode;
+ entry->polarity = 0;
+ entry->trigger = 0;
+ entry->mask = 0;
+ entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3801,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
struct uv_IO_APIC_route_entry *entry;
int mmr_pnode;
+ BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
mmr_value = 0;
entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
- BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
entry->mask = 1;
mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3868,71 @@ int __init arch_probe_nr_irqs(void)
}
#endif
+static int __io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr)
+{
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+ int node;
+ int ioapic, pin;
+ int trigger, polarity;
+
+ ioapic = irq_attr->ioapic;
+ if (!IO_APIC_IRQ(irq)) {
+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ ioapic);
+ return -EINVAL;
+ }
+
+ if (dev)
+ node = dev_to_node(dev);
+ else
+ node = cpu_to_node(boot_cpu_id);
+
+ desc = irq_to_desc_alloc_node(irq, node);
+ if (!desc) {
+ printk(KERN_INFO "can not get irq_desc %d\n", irq);
+ return 0;
+ }
+
+ pin = irq_attr->ioapic_pin;
+ trigger = irq_attr->trigger;
+ polarity = irq_attr->polarity;
+
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+ if (irq >= NR_IRQS_LEGACY) {
+ cfg = desc->chip_data;
+ add_pin_to_irq_node(cfg, node, ioapic, pin);
+ }
+
+ setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+
+ return 0;
+}
+
+int io_apic_set_pci_routing(struct device *dev, int irq,
+ struct io_apic_irq_attr *irq_attr)
+{
+ int ioapic, pin;
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
+ ioapic = irq_attr->ioapic;
+ pin = irq_attr->ioapic_pin;
+ if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+ pr_debug("Pin %d-%d already programmed\n",
+ mp_ioapics[ioapic].apicid, pin);
+ return 0;
+ }
+ set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+ return __io_apic_set_pci_routing(dev, irq, irq_attr);
+}
+
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
@@ -3980,6 +4013,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
return apic_id;
}
+#endif
int __init io_apic_get_version(int ioapic)
{
@@ -3992,39 +4026,6 @@ int __init io_apic_get_version(int ioapic)
return reg_01.bits.version;
}
-#endif
-
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
-{
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- int cpu = boot_cpu_id;
-
- if (!IO_APIC_IRQ(irq)) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- ioapic);
- return -EINVAL;
- }
-
- desc = irq_to_desc_alloc_cpu(irq, cpu);
- if (!desc) {
- printk(KERN_INFO "can not get irq_desc %d\n", irq);
- return 0;
- }
-
- /*
- * IRQs < 16 are already in the irq_2_pin[] map
- */
- if (irq >= NR_IRQS_LEGACY) {
- cfg = desc->chip_data;
- add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
- }
-
- setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
- return 0;
-}
-
int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
{
@@ -4055,51 +4056,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
- int pin, ioapic, irq, irq_entry;
+ int pin, ioapic = 0, irq, irq_entry;
struct irq_desc *desc;
- struct irq_cfg *cfg;
const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
- for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
- irq_entry = find_irq_entry(ioapic, pin, mp_INT);
- if (irq_entry == -1)
- continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
-
- /* setup_IO_APIC_irqs could fail to get vector for some device
- * when you have too many devices, because at that time only boot
- * cpu is online.
- */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- if (!cfg->vector) {
- setup_IO_APIC_irq(ioapic, pin, irq, desc,
- irq_trigger(irq_entry),
- irq_polarity(irq_entry));
- continue;
+#ifdef CONFIG_ACPI
+ if (!acpi_disabled && acpi_ioapic) {
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ ioapic = 0;
+ }
+#endif
- }
+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
- /*
- * Honour affinities which have been set in early boot
- */
- if (desc->status &
- (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
- else
- mask = apic->target_cpus();
+ desc = irq_to_desc(irq);
- if (intr_remapping_enabled)
- set_ir_ioapic_affinity_irq_desc(desc, mask);
- else
- set_ioapic_affinity_irq_desc(desc, mask);
- }
+ /*
+ * Honour affinities which have been set in early boot
+ */
+ if (desc->status &
+ (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+ mask = desc->affinity;
+ else
+ mask = apic->target_cpus();
+ if (intr_remapping_enabled)
+ set_ir_ioapic_affinity_irq_desc(desc, mask);
+ else
+ set_ioapic_affinity_irq_desc(desc, mask);
}
+
}
#endif
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index ce4fbfa315a..a691302dc3f 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
-static void report_broken_nmi(int cpu, int *prev_nmi_count)
+static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
{
printk(KERN_CONT "\n");
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e..440a8bccd91 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
extern struct apic apic_bigsmp;
extern struct apic apic_es7000;
extern struct apic apic_es7000_cluster;
-extern struct apic apic_default;
struct apic *apic = &apic_default;
EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e..bc3e880f9b8 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
void __init default_setup_apic_routing(void)
{
#ifdef CONFIG_X86_X2APIC
- if (x2apic && (apic != &apic_x2apic_phys &&
+ if (x2apic_mode && (apic != &apic_x2apic_phys &&
#ifdef CONFIG_X86_UV
apic != &apic_x2apic_uv_x &&
#endif
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d8..344eee4ac0a 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
}
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT 4
-#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
static const struct cpumask *summit_target_cpus(void)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d1..8e4cbb255c3 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
#include <asm/apic.h>
#include <asm/ipi.h>
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda6935297..ef0ae207a7c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
cpumask_set_cpu(cpu, retmask);
}
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
#ifdef CONFIG_SMP
unsigned long val;
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
- int max_pnode = 0;
+ int gnode_extra, max_pnode = 0;
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
+ pnode_mask = (1 << n_val) - 1;
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+ gnode_upper = ((unsigned long)gnode_extra << m_val);
+ printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+ n_val, m_val, gnode_upper, gnode_extra);
+
printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_blade_info);
get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_node_to_blade);
memset(uv_node_to_blade, 255, bytes);
bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+ BUG_ON(!uv_cpu_to_blade);
memset(uv_cpu_to_blade, 255, bytes);
blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
}
}
- pnode_mask = (1 << n_val) - 1;
- node_id.v = uv_read_local_mmr(UVH_NODE_ID);
- gnode_upper = (((unsigned long)node_id.s.node_id) &
- ~((1 << n_val) - 1)) << m_val;
-
uv_bios_init();
uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
&sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+ uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162..1a830cbd701 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -146,4 +146,5 @@ void foo(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b..898ecc47e12 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+ OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e..3efcb2b96a1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
#
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
#
# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
-obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE) += mcheck/
+obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa6..e5b27d8f1b4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
# include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
int cpu = smp_processor_id();
int node;
- unsigned apicid = hard_smp_processor_id();
+ unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
node = c->phys_proc_id;
if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
(c->x86_model == 8 && c->x86_mask >= 8))
set_cpu_cap(c, X86_FEATURE_K6_MTRR);
#endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+ /* check CPU config space for extended APIC ID */
+ if (c->x86 >= 0xf) {
+ unsigned int val;
+ val = read_pci_config(0, 24, 0, 0x68);
+ if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+ set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+ }
+#endif
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 77848d9fca6..3ffdcfa9abd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
#include <linux/io.h>
#include <asm/stackprotector.h>
+#include <asm/perf_counter.h>
#include <asm/mmu_context.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -299,7 +300,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */
}
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
void load_percpu_segment(int cpu)
{
@@ -768,6 +770,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_identify)
this_cpu->c_identify(c);
+ /* Clear/Set all flags overriden by options, after probe */
+ for (i = 0; i < NCAPINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+
#ifdef CONFIG_X86_64
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
#endif
@@ -813,6 +821,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
#endif
init_hypervisor(c);
+
+ /*
+ * Clear/Set all flags overriden by options, need do it
+ * before following smp all cpus cap AND.
+ */
+ for (i = 0; i < NCAPINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+
/*
* On SMP, boot_cpu_data holds the common feature set between
* all CPUs; so make sure that we indicate which features are
@@ -825,10 +843,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
- /* Clear all flags overriden by options */
- for (i = 0; i < NCAPINTS; i++)
- c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
#ifdef CONFIG_X86_MCE
/* Init Machine Check Exception if available. */
mcheck_init(c);
@@ -861,6 +875,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
+ init_hw_perf_counters();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6..6b2a52dd040 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(unsigned, cpu_modelflag);
static DEFINE_PER_CPU(int, cpu_priv_count);
-static DEFINE_PER_CPU(unsigned, cpu_model);
static DEFINE_MUTEX(cpu_debug_lock);
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
{ "value", CPU_REG_ALL, 1 },
};
-/* Intel Registers Range */
-static struct cpu_debug_range cpu_intel_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, CPU_INTEL_ALL },
- { 0x00000006, 0x00000007, CPU_MONITOR, CPU_CX_AT_XE },
- { 0x00000010, 0x00000010, CPU_TIME, CPU_INTEL_ALL },
- { 0x00000011, 0x00000013, CPU_PMC, CPU_INTEL_PENTIUM },
- { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE },
- { 0x0000001B, 0x0000001B, CPU_APIC, CPU_P6_CX_AT_XE },
-
- { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_PX_CX_AT_XE },
- { 0x0000002B, 0x0000002B, CPU_POWERON, CPU_INTEL_XEON },
- { 0x0000002C, 0x0000002C, CPU_FREQ, CPU_INTEL_XEON },
- { 0x0000003A, 0x0000003A, CPU_CONTROL, CPU_CX_AT_XE },
-
- { 0x00000040, 0x00000043, CPU_LBRANCH, CPU_PM_CX_AT_XE },
- { 0x00000044, 0x00000047, CPU_LBRANCH, CPU_PM_CO_AT },
- { 0x00000060, 0x00000063, CPU_LBRANCH, CPU_C2_AT },
- { 0x00000064, 0x00000067, CPU_LBRANCH, CPU_INTEL_ATOM },
-
- { 0x00000079, 0x00000079, CPU_BIOS, CPU_P6_CX_AT_XE },
- { 0x00000088, 0x0000008A, CPU_CACHE, CPU_INTEL_P6 },
- { 0x0000008B, 0x0000008B, CPU_BIOS, CPU_P6_CX_AT_XE },
- { 0x0000009B, 0x0000009B, CPU_MONITOR, CPU_INTEL_XEON },
-
- { 0x000000C1, 0x000000C2, CPU_PMC, CPU_P6_CX_AT },
- { 0x000000CD, 0x000000CD, CPU_FREQ, CPU_CX_AT },
- { 0x000000E7, 0x000000E8, CPU_PERF, CPU_CX_AT },
- { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_P6_CX_XE },
-
- { 0x00000116, 0x00000116, CPU_CACHE, CPU_INTEL_P6 },
- { 0x00000118, 0x00000118, CPU_CACHE, CPU_INTEL_P6 },
- { 0x00000119, 0x00000119, CPU_CACHE, CPU_INTEL_PX },
- { 0x0000011A, 0x0000011B, CPU_CACHE, CPU_INTEL_P6 },
- { 0x0000011E, 0x0000011E, CPU_CACHE, CPU_PX_CX_AT },
-
- { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE },
- { 0x00000179, 0x0000017A, CPU_MC, CPU_PX_CX_AT_XE },
- { 0x0000017B, 0x0000017B, CPU_MC, CPU_P6_XE },
- { 0x00000186, 0x00000187, CPU_PMC, CPU_P6_CX_AT },
- { 0x00000198, 0x00000199, CPU_PERF, CPU_PM_CX_AT_XE },
- { 0x0000019A, 0x0000019A, CPU_TIME, CPU_PM_CX_AT_XE },
- { 0x0000019B, 0x0000019D, CPU_THERM, CPU_PM_CX_AT_XE },
- { 0x000001A0, 0x000001A0, CPU_MISC, CPU_PM_CX_AT_XE },
-
- { 0x000001C9, 0x000001C9, CPU_LBRANCH, CPU_PM_CX_AT },
- { 0x000001D7, 0x000001D8, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_CX_AT_XE },
- { 0x000001DA, 0x000001DA, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000001DB, 0x000001DB, CPU_LBRANCH, CPU_P6_XE },
- { 0x000001DC, 0x000001DC, CPU_LBRANCH, CPU_INTEL_P6 },
- { 0x000001DD, 0x000001DE, CPU_LBRANCH, CPU_PX_CX_AT_XE },
- { 0x000001E0, 0x000001E0, CPU_LBRANCH, CPU_INTEL_P6 },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000250, 0x00000250, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000258, 0x00000259, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000268, 0x0000026F, CPU_MTRR, CPU_P6_CX_XE },
- { 0x00000277, 0x00000277, CPU_PAT, CPU_C2_AT_XE },
- { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_P6_CX_XE },
-
- { 0x00000300, 0x00000308, CPU_PMC, CPU_INTEL_XEON },
- { 0x00000309, 0x0000030B, CPU_PMC, CPU_C2_AT_XE },
- { 0x0000030C, 0x00000311, CPU_PMC, CPU_INTEL_XEON },
- { 0x00000345, 0x00000345, CPU_PMC, CPU_C2_AT },
- { 0x00000360, 0x00000371, CPU_PMC, CPU_INTEL_XEON },
- { 0x0000038D, 0x00000390, CPU_PMC, CPU_C2_AT },
- { 0x000003A0, 0x000003BE, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003C0, 0x000003CD, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003E0, 0x000003E1, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003F0, 0x000003F0, CPU_PMC, CPU_INTEL_XEON },
- { 0x000003F1, 0x000003F1, CPU_PMC, CPU_C2_AT_XE },
- { 0x000003F2, 0x000003F2, CPU_PMC, CPU_INTEL_XEON },
-
- { 0x00000400, 0x00000402, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000403, 0x00000403, CPU_MC, CPU_INTEL_XEON },
- { 0x00000404, 0x00000406, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000407, 0x00000407, CPU_MC, CPU_INTEL_XEON },
- { 0x00000408, 0x0000040A, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x0000040B, 0x0000040B, CPU_MC, CPU_INTEL_XEON },
- { 0x0000040C, 0x0000040E, CPU_MC, CPU_PM_CX_XE },
- { 0x0000040F, 0x0000040F, CPU_MC, CPU_INTEL_XEON },
- { 0x00000410, 0x00000412, CPU_MC, CPU_PM_CX_AT_XE },
- { 0x00000413, 0x00000417, CPU_MC, CPU_CX_AT_XE },
- { 0x00000480, 0x0000048B, CPU_VMX, CPU_CX_AT_XE },
-
- { 0x00000600, 0x00000600, CPU_DEBUG, CPU_PM_CX_AT_XE },
- { 0x00000680, 0x0000068F, CPU_LBRANCH, CPU_INTEL_XEON },
- { 0x000006C0, 0x000006CF, CPU_LBRANCH, CPU_INTEL_XEON },
-
- { 0x000107CC, 0x000107D3, CPU_PMC, CPU_INTEL_XEON_MP },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON },
- { 0xC0000081, 0xC0000082, CPU_CALL, CPU_INTEL_XEON },
- { 0xC0000084, 0xC0000084, CPU_CALL, CPU_INTEL_XEON },
- { 0xC0000100, 0xC0000102, CPU_BASE, CPU_INTEL_XEON },
+/* CPU Registers Range */
+static struct cpu_debug_range cpu_reg_range[] = {
+ { 0x00000000, 0x00000001, CPU_MC, },
+ { 0x00000006, 0x00000007, CPU_MONITOR, },
+ { 0x00000010, 0x00000010, CPU_TIME, },
+ { 0x00000011, 0x00000013, CPU_PMC, },
+ { 0x00000017, 0x00000017, CPU_PLATFORM, },
+ { 0x0000001B, 0x0000001B, CPU_APIC, },
+ { 0x0000002A, 0x0000002B, CPU_POWERON, },
+ { 0x0000002C, 0x0000002C, CPU_FREQ, },
+ { 0x0000003A, 0x0000003A, CPU_CONTROL, },
+ { 0x00000040, 0x00000047, CPU_LBRANCH, },
+ { 0x00000060, 0x00000067, CPU_LBRANCH, },
+ { 0x00000079, 0x00000079, CPU_BIOS, },
+ { 0x00000088, 0x0000008A, CPU_CACHE, },
+ { 0x0000008B, 0x0000008B, CPU_BIOS, },
+ { 0x0000009B, 0x0000009B, CPU_MONITOR, },
+ { 0x000000C1, 0x000000C4, CPU_PMC, },
+ { 0x000000CD, 0x000000CD, CPU_FREQ, },
+ { 0x000000E7, 0x000000E8, CPU_PERF, },
+ { 0x000000FE, 0x000000FE, CPU_MTRR, },
+
+ { 0x00000116, 0x0000011E, CPU_CACHE, },
+ { 0x00000174, 0x00000176, CPU_SYSENTER, },
+ { 0x00000179, 0x0000017B, CPU_MC, },
+ { 0x00000186, 0x00000189, CPU_PMC, },
+ { 0x00000198, 0x00000199, CPU_PERF, },
+ { 0x0000019A, 0x0000019A, CPU_TIME, },
+ { 0x0000019B, 0x0000019D, CPU_THERM, },
+ { 0x000001A0, 0x000001A0, CPU_MISC, },
+ { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
+ { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
+ { 0x000001D9, 0x000001D9, CPU_DEBUG, },
+ { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
+
+ { 0x00000200, 0x0000020F, CPU_MTRR, },
+ { 0x00000250, 0x00000250, CPU_MTRR, },
+ { 0x00000258, 0x00000259, CPU_MTRR, },
+ { 0x00000268, 0x0000026F, CPU_MTRR, },
+ { 0x00000277, 0x00000277, CPU_PAT, },
+ { 0x000002FF, 0x000002FF, CPU_MTRR, },
+
+ { 0x00000300, 0x00000311, CPU_PMC, },
+ { 0x00000345, 0x00000345, CPU_PMC, },
+ { 0x00000360, 0x00000371, CPU_PMC, },
+ { 0x0000038D, 0x00000390, CPU_PMC, },
+ { 0x000003A0, 0x000003BE, CPU_PMC, },
+ { 0x000003C0, 0x000003CD, CPU_PMC, },
+ { 0x000003E0, 0x000003E1, CPU_PMC, },
+ { 0x000003F0, 0x000003F2, CPU_PMC, },
+
+ { 0x00000400, 0x00000417, CPU_MC, },
+ { 0x00000480, 0x0000048B, CPU_VMX, },
+
+ { 0x00000600, 0x00000600, CPU_DEBUG, },
+ { 0x00000680, 0x0000068F, CPU_LBRANCH, },
+ { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
+
+ { 0x000107CC, 0x000107D3, CPU_PMC, },
+
+ { 0xC0000080, 0xC0000080, CPU_FEATURES, },
+ { 0xC0000081, 0xC0000084, CPU_CALL, },
+ { 0xC0000100, 0xC0000102, CPU_BASE, },
+ { 0xC0000103, 0xC0000103, CPU_TIME, },
+
+ { 0xC0010000, 0xC0010007, CPU_PMC, },
+ { 0xC0010010, 0xC0010010, CPU_CONF, },
+ { 0xC0010015, 0xC0010015, CPU_CONF, },
+ { 0xC0010016, 0xC001001A, CPU_MTRR, },
+ { 0xC001001D, 0xC001001D, CPU_MTRR, },
+ { 0xC001001F, 0xC001001F, CPU_CONF, },
+ { 0xC0010030, 0xC0010035, CPU_BIOS, },
+ { 0xC0010044, 0xC0010048, CPU_MC, },
+ { 0xC0010050, 0xC0010056, CPU_SMM, },
+ { 0xC0010058, 0xC0010058, CPU_CONF, },
+ { 0xC0010060, 0xC0010060, CPU_CACHE, },
+ { 0xC0010061, 0xC0010068, CPU_SMM, },
+ { 0xC0010069, 0xC001006B, CPU_SMM, },
+ { 0xC0010070, 0xC0010071, CPU_SMM, },
+ { 0xC0010111, 0xC0010113, CPU_SMM, },
+ { 0xC0010114, 0xC0010118, CPU_SVM, },
+ { 0xC0010140, 0xC0010141, CPU_OSVM, },
+ { 0xC0011022, 0xC0011023, CPU_CONF, },
};
-/* AMD Registers Range */
-static struct cpu_debug_range cpu_amd_range[] = {
- { 0x00000000, 0x00000001, CPU_MC, CPU_K10_PLUS, },
- { 0x00000010, 0x00000010, CPU_TIME, CPU_K8_PLUS, },
- { 0x0000001B, 0x0000001B, CPU_APIC, CPU_K8_PLUS, },
- { 0x0000002A, 0x0000002A, CPU_POWERON, CPU_K7_PLUS },
- { 0x0000008B, 0x0000008B, CPU_VER, CPU_K8_PLUS },
- { 0x000000FE, 0x000000FE, CPU_MTRR, CPU_K8_PLUS, },
-
- { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS, },
- { 0x00000179, 0x0000017B, CPU_MC, CPU_K8_PLUS, },
- { 0x000001D9, 0x000001D9, CPU_DEBUG, CPU_K8_PLUS, },
- { 0x000001DB, 0x000001DE, CPU_LBRANCH, CPU_K8_PLUS, },
-
- { 0x00000200, 0x0000020F, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000250, 0x00000250, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000258, 0x00000259, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000268, 0x0000026F, CPU_MTRR, CPU_K8_PLUS, },
- { 0x00000277, 0x00000277, CPU_PAT, CPU_K8_PLUS, },
- { 0x000002FF, 0x000002FF, CPU_MTRR, CPU_K8_PLUS, },
-
- { 0x00000400, 0x00000413, CPU_MC, CPU_K8_PLUS, },
-
- { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL, },
- { 0xC0000081, 0xC0000084, CPU_CALL, CPU_K8_PLUS, },
- { 0xC0000100, 0xC0000102, CPU_BASE, CPU_K8_PLUS, },
- { 0xC0000103, 0xC0000103, CPU_TIME, CPU_K10_PLUS, },
-
- { 0xC0010000, 0xC0010007, CPU_PMC, CPU_K8_PLUS, },
- { 0xC0010010, 0xC0010010, CPU_CONF, CPU_K7_PLUS, },
- { 0xC0010015, 0xC0010015, CPU_CONF, CPU_K7_PLUS, },
- { 0xC0010016, 0xC001001A, CPU_MTRR, CPU_K8_PLUS, },
- { 0xC001001D, 0xC001001D, CPU_MTRR, CPU_K8_PLUS, },
- { 0xC001001F, 0xC001001F, CPU_CONF, CPU_K8_PLUS, },
- { 0xC0010030, 0xC0010035, CPU_BIOS, CPU_K8_PLUS, },
- { 0xC0010044, 0xC0010048, CPU_MC, CPU_K8_PLUS, },
- { 0xC0010050, 0xC0010056, CPU_SMM, CPU_K0F_PLUS, },
- { 0xC0010058, 0xC0010058, CPU_CONF, CPU_K10_PLUS, },
- { 0xC0010060, 0xC0010060, CPU_CACHE, CPU_AMD_11, },
- { 0xC0010061, 0xC0010068, CPU_SMM, CPU_K10_PLUS, },
- { 0xC0010069, 0xC001006B, CPU_SMM, CPU_AMD_11, },
- { 0xC0010070, 0xC0010071, CPU_SMM, CPU_K10_PLUS, },
- { 0xC0010111, 0xC0010113, CPU_SMM, CPU_K8_PLUS, },
- { 0xC0010114, 0xC0010118, CPU_SVM, CPU_K10_PLUS, },
- { 0xC0010140, 0xC0010141, CPU_OSVM, CPU_K10_PLUS, },
- { 0xC0011022, 0xC0011023, CPU_CONF, CPU_K10_PLUS, },
-};
-
-
-/* Intel */
-static int get_intel_modelflag(unsigned model)
-{
- int flag;
-
- switch (model) {
- case 0x0501:
- case 0x0502:
- case 0x0504:
- flag = CPU_INTEL_PENTIUM;
- break;
- case 0x0601:
- case 0x0603:
- case 0x0605:
- case 0x0607:
- case 0x0608:
- case 0x060A:
- case 0x060B:
- flag = CPU_INTEL_P6;
- break;
- case 0x0609:
- case 0x060D:
- flag = CPU_INTEL_PENTIUM_M;
- break;
- case 0x060E:
- flag = CPU_INTEL_CORE;
- break;
- case 0x060F:
- case 0x0617:
- flag = CPU_INTEL_CORE2;
- break;
- case 0x061C:
- flag = CPU_INTEL_ATOM;
- break;
- case 0x0F00:
- case 0x0F01:
- case 0x0F02:
- case 0x0F03:
- case 0x0F04:
- flag = CPU_INTEL_XEON_P4;
- break;
- case 0x0F06:
- flag = CPU_INTEL_XEON_MP;
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-/* AMD */
-static int get_amd_modelflag(unsigned model)
-{
- int flag;
-
- switch (model >> 8) {
- case 0x6:
- flag = CPU_AMD_K6;
- break;
- case 0x7:
- flag = CPU_AMD_K7;
- break;
- case 0x8:
- flag = CPU_AMD_K8;
- break;
- case 0xf:
- flag = CPU_AMD_0F;
- break;
- case 0x10:
- flag = CPU_AMD_10;
- break;
- case 0x11:
- flag = CPU_AMD_11;
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-static int get_cpu_modelflag(unsigned cpu)
-{
- int flag;
-
- flag = per_cpu(cpu_model, cpu);
-
- switch (flag >> 16) {
- case X86_VENDOR_INTEL:
- flag = get_intel_modelflag(flag);
- break;
- case X86_VENDOR_AMD:
- flag = get_amd_modelflag(flag & 0xffff);
- break;
- default:
- flag = CPU_NONE;
- break;
- }
-
- return flag;
-}
-
-static int get_cpu_range_count(unsigned cpu)
-{
- int index;
-
- switch (per_cpu(cpu_model, cpu) >> 16) {
- case X86_VENDOR_INTEL:
- index = ARRAY_SIZE(cpu_intel_range);
- break;
- case X86_VENDOR_AMD:
- index = ARRAY_SIZE(cpu_amd_range);
- break;
- default:
- index = 0;
- break;
- }
-
- return index;
-}
-
static int is_typeflag_valid(unsigned cpu, unsigned flag)
{
- unsigned vendor, modelflag;
- int i, index;
+ int i;
/* Standard Registers should be always valid */
if (flag >= CPU_TSS)
return 1;
- modelflag = per_cpu(cpu_modelflag, cpu);
- vendor = per_cpu(cpu_model, cpu) >> 16;
- index = get_cpu_range_count(cpu);
-
- for (i = 0; i < index; i++) {
- switch (vendor) {
- case X86_VENDOR_INTEL:
- if ((cpu_intel_range[i].model & modelflag) &&
- (cpu_intel_range[i].flag & flag))
- return 1;
- break;
- case X86_VENDOR_AMD:
- if ((cpu_amd_range[i].model & modelflag) &&
- (cpu_amd_range[i].flag & flag))
- return 1;
- break;
- }
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
+ if (cpu_reg_range[i].flag == flag)
+ return 1;
}
/* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
int index, unsigned flag)
{
- unsigned modelflag;
-
- modelflag = per_cpu(cpu_modelflag, cpu);
- *max = 0;
- switch (per_cpu(cpu_model, cpu) >> 16) {
- case X86_VENDOR_INTEL:
- if ((cpu_intel_range[index].model & modelflag) &&
- (cpu_intel_range[index].flag & flag)) {
- *min = cpu_intel_range[index].min;
- *max = cpu_intel_range[index].max;
- }
- break;
- case X86_VENDOR_AMD:
- if ((cpu_amd_range[index].model & modelflag) &&
- (cpu_amd_range[index].flag & flag)) {
- *min = cpu_amd_range[index].min;
- *max = cpu_amd_range[index].max;
- }
- break;
- }
+ if (cpu_reg_range[index].flag == flag) {
+ *min = cpu_reg_range[index].min;
+ *max = cpu_reg_range[index].max;
+ } else
+ *max = 0;
return *max;
}
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
unsigned msr, msr_min, msr_max;
struct cpu_private *priv;
u32 low, high;
- int i, range;
+ int i;
if (seq) {
priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
}
}
- range = get_cpu_range_count(cpu);
-
- for (i = 0; i < range; i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
continue;
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
-#endif /* CONFIG_X86_LOCAL_APIC */
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ unsigned int i, v, maxeilvt;
+
+ v = apic_read(APIC_EFEAT);
+ maxeilvt = (v >> 16) & 0xff;
+ seq_printf(seq, " EFEAT\t\t: %08x\n", v);
+ seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
+ for (i = 0; i < maxeilvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
+ }
+ }
+#endif /* CONFIG_X86_LOCAL_APIC */
seq_printf(seq, "\n MSR\t:\n");
}
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
{
struct dentry *cpu_dentry = NULL;
unsigned reg, reg_min, reg_max;
- int i, range, err = 0;
+ int i, err = 0;
char reg_dir[12];
u32 low, high;
- range = get_cpu_range_count(cpu);
-
- for (i = 0; i < range; i++) {
+ for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
cpu_base[type].flag))
continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
cpui = &cpu_data(cpu);
if (!cpu_has(cpui, X86_FEATURE_MSR))
continue;
- per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
- (cpui->x86 << 8) |
- (cpui->x86_model));
- per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
sprintf(cpu_dir, "cpu%d", cpu);
cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c83987547..f138c6c389b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
If in doubt, say N.
config X86_E_POWERSAVER
- tristate "VIA C7 Enhanced PowerSaver"
+ tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
select CPU_FREQ_TABLE
- depends on X86_32
+ depends on X86_32 && EXPERIMENTAL
help
- This adds the CPUFreq driver for VIA C7 processors.
+ This adds the CPUFreq driver for VIA C7 processors. However, this driver
+ does not have any safeguards to prevent operating the CPU out of spec
+ and is thus considered dangerous. Please use the regular ACPI cpufreq
+ driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
If in doubt, say N.
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 752e8c6b2c7..ae9b503220c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
{
struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
- if (cpu->x86_vendor != X86_VENDOR_INTEL ||
- !cpu_has(cpu, X86_FEATURE_EST))
- return 0;
-
- return 1;
+ return cpu_has(cpu, X86_FEATURE_EST);
}
static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c0..daed39ba261 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
}
#endif
-static void __cpuinit srat_detect_node(void)
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
{
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
unsigned node;
int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
+ int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
@@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
/* Work around errata */
- srat_detect_node();
+ srat_detect_node(c);
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e10..789efe217e1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
#include <asm/processor.h>
#include <asm/smp.h>
+#include <asm/k8.h>
#define LVL_1_INST 1
#define LVL_1_DATA 2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
unsigned long can_disable;
};
-#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
-static struct pci_device_id k8_nb_id[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
- {}
-};
-#endif
-
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
};
static const unsigned short __cpuinitconst assocs[] = {
- [1] = 1, [2] = 2, [4] = 4, [6] = 8,
- [8] = 16, [0xa] = 32, [0xb] = 48,
+ [1] = 1,
+ [2] = 2,
+ [4] = 4,
+ [6] = 8,
+ [8] = 16,
+ [0xa] = 32,
+ [0xb] = 48,
[0xc] = 64,
- [0xf] = 0xffff // ??
+ [0xd] = 96,
+ [0xe] = 128,
+ [0xf] = 0xffff /* fully associative - no way to show this currently */
};
static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
eax->split.type = types[leaf];
eax->split.level = levels[leaf];
if (leaf == 3)
- eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+ eax->split.num_threads_sharing =
+ current_cpu_data.x86_max_cores - 1;
else
eax->split.num_threads_sharing = 0;
eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
{
if (index < 3)
return;
+
+ if (boot_cpu_data.x86 == 0x11)
+ return;
+
+ /* see erratum #382 */
+ if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+ return;
+
this_leaf->can_disable = 1;
}
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
#define to_object(k) container_of(k, struct _index_kobject, kobj)
#define to_attr(a) container_of(a, struct _cache_attr, attr)
-#ifdef CONFIG_PCI
-static struct pci_dev *get_k8_northbridge(int node)
-{
- struct pci_dev *dev = NULL;
- int i;
-
- for (i = 0; i <= node; i++) {
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (!dev)
- break;
- } while (!pci_match_id(&k8_nb_id[0], dev));
- if (!dev)
- break;
- }
- return dev;
-}
-#else
-static struct pci_dev *get_k8_northbridge(int node)
-{
- return NULL;
-}
-#endif
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+ unsigned int index)
{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int node = cpu_to_node(cpumask_first(mask));
- struct pci_dev *dev = NULL;
- ssize_t ret = 0;
- int i;
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned int reg = 0;
if (!this_leaf->can_disable)
- return sprintf(buf, "Feature not enabled\n");
-
- dev = get_k8_northbridge(node);
- if (!dev) {
- printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
return -EINVAL;
- }
- for (i = 0; i < 2; i++) {
- unsigned int reg;
+ if (!dev)
+ return -EINVAL;
- pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+ pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+ return sprintf(buf, "%x\n", reg);
+}
- ret += sprintf(buf, "%sEntry: %d\n", buf, i);
- ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
- buf,
- reg & 0x80000000 ? "Disabled" : "Allowed",
- reg & 0x40000000 ? "Disabled" : "Allowed");
- ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
- buf, (reg & 0x30000) >> 16, reg & 0xfff);
- }
- return ret;
+#define SHOW_CACHE_DISABLE(index) \
+static ssize_t \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+{ \
+ return show_cache_disable(this_leaf, buf, index); \
}
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
-static ssize_t
-store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
- size_t count)
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count, unsigned int index)
{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int node = cpu_to_node(cpumask_first(mask));
- struct pci_dev *dev = NULL;
- unsigned int ret, index, val;
+ int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ int node = cpu_to_node(cpu);
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+ unsigned long val = 0;
+ unsigned int scrubber = 0;
if (!this_leaf->can_disable)
- return 0;
-
- if (strlen(buf) > 15)
return -EINVAL;
- ret = sscanf(buf, "%x %x", &index, &val);
- if (ret != 2)
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!dev)
return -EINVAL;
- if (index > 1)
+
+ if (strict_strtoul(buf, 10, &val) < 0)
return -EINVAL;
val |= 0xc0000000;
- dev = get_k8_northbridge(node);
- if (!dev) {
- printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
- return -EINVAL;
- }
+
+ pci_read_config_dword(dev, 0x58, &scrubber);
+ scrubber &= ~0x1f000000;
+ pci_write_config_dword(dev, 0x58, scrubber);
pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
wbinvd();
pci_write_config_dword(dev, 0x1BC + index * 4, val);
+ return count;
+}
- return 1;
+#define STORE_CACHE_DISABLE(index) \
+static ssize_t \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+ const char *buf, size_t count) \
+{ \
+ return store_cache_disable(this_leaf, buf, count, index); \
}
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
struct _cache_attr {
struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
-static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+ show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+ show_cache_disable_1, store_cache_disable_1);
static struct attribute * default_attrs[] = {
&type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
&size.attr,
&shared_cpu_map.attr,
&shared_cpu_list.attr,
- &cache_disable.attr,
+ &cache_disable_0.attr,
+ &cache_disable_1.attr,
NULL
};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index cef3ee30744..65a0fceedcd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,6 @@
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
-#include <asm/apic.h>
asmlinkage void smp_thermal_interrupt(void)
{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04..1d584a18a50 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
+ rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
*/
if (!is_cpu(INTEL) || disable_mtrr_trim)
return 0;
- rdmsr(MTRRdefType_MSR, def, dummy);
+ rdmsr(MSR_MTRRdefType, def, dummy);
def &= 0xff;
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index d21d4fb161f..0543f69f0b2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,9 +20,9 @@ struct fixed_range_block {
};
static struct fixed_range_block fixed_range_blocks[] = {
- { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */
- { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */
- { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */
+ { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */
+ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */
+ { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */
{}
};
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
k8_check_syscfg_dram_mod_en();
- rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
+ rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
for (i = 0; i < 2; i++)
- rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+ rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
for (i = 0; i < 8; i++)
- rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+ rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
}
void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
vrs = mtrr_state.var_ranges;
- rdmsr(MTRRcap_MSR, lo, dummy);
+ rdmsr(MSR_MTRRcap, lo, dummy);
mtrr_state.have_fixed = (lo >> 8) & 1;
for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
if (mtrr_state.have_fixed)
get_fixed_ranges(mtrr_state.fixed_ranges);
- rdmsr(MTRRdefType_MSR, lo, dummy);
+ rdmsr(MSR_MTRRdefType, lo, dummy);
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
__flush_tlb();
/* Save MTRR state */
- rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
}
static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
__flush_tlb();
/* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
/* Enable caches */
write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
static int generic_have_wrcomb(void)
{
unsigned long config, dummy;
- rdmsr(MTRRcap_MSR, config, dummy);
+ rdmsr(MSR_MTRRcap, config, dummy);
return (config & (1 << 10));
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c..8fc248b5aea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
unsigned long config = 0, dummy;
if (use_intel()) {
- rdmsr(MTRRcap_MSR, config, dummy);
+ rdmsr(MSR_MTRRcap, config, dummy);
} else if (is_cpu(AMD))
config = 2;
else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347..7538b767f20 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,21 +5,6 @@
#include <linux/types.h>
#include <linux/stddef.h>
-#define MTRRcap_MSR 0x0fe
-#define MTRRdefType_MSR 0x2ff
-
-#define MTRRfix64K_00000_MSR 0x250
-#define MTRRfix16K_80000_MSR 0x258
-#define MTRRfix16K_A0000_MSR 0x259
-#define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
-
#define MTRR_CHANGE_MASK_FIXED 0x01
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685..1f5fb1588d1 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
if (use_intel())
/* Save MTRR state */
- rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
else
/* Cyrix ARRs - everything else were excluded at the top */
ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
{
if (use_intel())
/* Disable MTRRs, and set the default type to uncached */
- mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
+ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
ctxt->deftype_hi);
else if (is_cpu(CYRIX))
/* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
/* Restore MTRRdefType */
if (use_intel())
/* Intel (P6) standard MTRRs */
- mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
else
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, ctxt->ccr3);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 00000000000..895c82e7845
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1704 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2009 Jaswinder Singh Rajput
+ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/module.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include <asm/apic.h>
+#include <asm/stacktrace.h>
+#include <asm/nmi.h>
+
+static u64 perf_counter_mask __read_mostly;
+
+struct cpu_hw_counters {
+ struct perf_counter *counters[X86_PMC_IDX_MAX];
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long interrupts;
+ int enabled;
+};
+
+/*
+ * struct x86_pmu - generic x86 pmu
+ */
+struct x86_pmu {
+ const char *name;
+ int version;
+ int (*handle_irq)(struct pt_regs *);
+ void (*disable_all)(void);
+ void (*enable_all)(void);
+ void (*enable)(struct hw_perf_counter *, int);
+ void (*disable)(struct hw_perf_counter *, int);
+ unsigned eventsel;
+ unsigned perfctr;
+ u64 (*event_map)(int);
+ u64 (*raw_event)(u64);
+ int max_events;
+ int num_counters;
+ int num_counters_fixed;
+ int counter_bits;
+ u64 counter_mask;
+ u64 max_period;
+ u64 intel_ctrl;
+};
+
+static struct x86_pmu x86_pmu __read_mostly;
+
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+ .enabled = 1,
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static const u64 intel_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+};
+
+static u64 intel_pmu_event_map(int event)
+{
+ return intel_perfmon_event_map[event];
+}
+
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
+ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
+ [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
+ [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static const u64 atom_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
+ [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
+ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
+ [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
+ [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+ [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+ [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 intel_pmu_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
+#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
+
+#define CORE_EVNTSEL_MASK \
+ (CORE_EVNTSEL_EVENT_MASK | \
+ CORE_EVNTSEL_UNIT_MASK | \
+ CORE_EVNTSEL_EDGE_MASK | \
+ CORE_EVNTSEL_INV_MASK | \
+ CORE_EVNTSEL_COUNTER_MASK)
+
+ return event & CORE_EVNTSEL_MASK;
+}
+
+static const u64 amd_0f_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */
+ [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int event)
+{
+ return amd_perfmon_event_map[event];
+}
+
+static u64 amd_pmu_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
+#define K7_EVNTSEL_INV_MASK 0x000800000ULL
+#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK \
+ (K7_EVNTSEL_EVENT_MASK | \
+ K7_EVNTSEL_UNIT_MASK | \
+ K7_EVNTSEL_EDGE_MASK | \
+ K7_EVNTSEL_INV_MASK | \
+ K7_EVNTSEL_COUNTER_MASK)
+
+ return event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_counter_update(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ int shift = 64 - x86_pmu.counter_bits;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta;
+
+ /*
+ * Careful: an NMI might modify the previous counter value.
+ *
+ * Our tactic to handle this is to first atomically read and
+ * exchange a new raw count - then add that new-prev delta
+ * count to the generic counter atomically:
+ */
+again:
+ prev_raw_count = atomic64_read(&hwc->prev_count);
+ rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+ if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count)
+ goto again;
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (counter-)time and add that to the generic counter.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ atomic64_add(delta, &counter->count);
+ atomic64_sub(delta, &hwc->period_left);
+
+ return new_raw_count;
+}
+
+static atomic_t active_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+ int i;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ disable_lapic_nmi_watchdog();
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
+ goto perfctr_fail;
+ }
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
+ goto eventsel_fail;
+ }
+
+ return true;
+
+eventsel_fail:
+ for (i--; i >= 0; i--)
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+
+ i = x86_pmu.num_counters;
+
+perfctr_fail:
+ for (i--; i >= 0; i--)
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+
+ return false;
+}
+
+static void release_pmc_hardware(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ release_perfctr_nmi(x86_pmu.perfctr + i);
+ release_evntsel_nmi(x86_pmu.eventsel + i);
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+ if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
+ release_pmc_hardware();
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+}
+
+static inline int x86_pmu_initialized(void)
+{
+ return x86_pmu.handle_irq != NULL;
+}
+
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+ unsigned int cache_type, cache_op, cache_result;
+ u64 config, val;
+
+ config = attr->config;
+
+ cache_type = (config >> 0) & 0xff;
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+ return -EINVAL;
+
+ cache_op = (config >> 8) & 0xff;
+ if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+ return -EINVAL;
+
+ cache_result = (config >> 16) & 0xff;
+ if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return -EINVAL;
+
+ val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+ if (val == 0)
+ return -ENOENT;
+
+ if (val == -1)
+ return -EINVAL;
+
+ hwc->config |= val;
+
+ return 0;
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_counter_init(struct perf_counter *counter)
+{
+ struct perf_counter_attr *attr = &counter->attr;
+ struct hw_perf_counter *hwc = &counter->hw;
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = 0;
+ if (!atomic_inc_not_zero(&active_counters)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ atomic_inc(&active_counters);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+ if (err)
+ return err;
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ hwc->config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to.
+ */
+ if (!attr->exclude_user)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!attr->exclude_kernel)
+ hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (!hwc->sample_period) {
+ hwc->sample_period = x86_pmu.max_period;
+ hwc->last_period = hwc->sample_period;
+ atomic64_set(&hwc->period_left, hwc->sample_period);
+ }
+
+ counter->destroy = hw_perf_counter_destroy;
+
+ /*
+ * Raw event type provide the config in the event structure
+ */
+ if (attr->type == PERF_TYPE_RAW) {
+ hwc->config |= x86_pmu.raw_event(attr->config);
+ return 0;
+ }
+
+ if (attr->type == PERF_TYPE_HW_CACHE)
+ return set_ext_hw_attr(hwc, attr);
+
+ if (attr->config >= x86_pmu.max_events)
+ return -EINVAL;
+ /*
+ * The generic map:
+ */
+ hwc->config |= x86_pmu.event_map(attr->config);
+
+ return 0;
+}
+
+static void intel_pmu_disable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+}
+
+static void amd_pmu_disable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ /*
+ * ensure we write the disable before we start disabling the
+ * counters proper, so that amd_pmu_enable_counter() does the
+ * right thing.
+ */
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+ continue;
+ val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_disable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ return x86_pmu.disable_all();
+}
+
+static void intel_pmu_enable_all(void)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+}
+
+static void amd_pmu_enable_all(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ int idx;
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ u64 val;
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+ continue;
+ val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+ }
+}
+
+void hw_perf_enable(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+ x86_pmu.enable_all();
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+ u64 status;
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+ return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ int err;
+ err = checking_wrmsrl(hwc->config_base + idx,
+ hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ int err;
+ err = checking_wrmsrl(hwc->config_base + idx,
+ hwc->config);
+}
+
+static inline void
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, mask;
+ int err;
+
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_disable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static int
+x86_perf_counter_set_period(struct perf_counter *counter,
+ struct hw_perf_counter *hwc, int idx)
+{
+ s64 left = atomic64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int err, ret = 0;
+
+ /*
+ * If we are way outside a reasoable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ atomic64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+ /*
+ * Quirk: certain CPUs dont like it if just 1 event is left:
+ */
+ if (unlikely(left < 2))
+ left = 2;
+
+ if (left > x86_pmu.max_period)
+ left = x86_pmu.max_period;
+
+ per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+ /*
+ * The hw counter starts counting from this counter offset,
+ * mark it to be able to extra future deltas:
+ */
+ atomic64_set(&hwc->prev_count, (u64)-left);
+
+ err = checking_wrmsrl(hwc->counter_base + idx,
+ (u64)(-left) & x86_pmu.counter_mask);
+
+ return ret;
+}
+
+static inline void
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
+{
+ int idx = __idx - X86_PMC_IDX_FIXED;
+ u64 ctrl_val, bits, mask;
+ int err;
+
+ /*
+ * Enable IRQ generation (0x8),
+ * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+ * if requested:
+ */
+ bits = 0x8ULL;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+ bits |= 0x2;
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ bits |= 0x1;
+ bits <<= (idx * 4);
+ mask = 0xfULL << (idx * 4);
+
+ rdmsrl(hwc->config_base, ctrl_val);
+ ctrl_val &= ~mask;
+ ctrl_val |= bits;
+ err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+ intel_pmu_enable_fixed(hwc, idx);
+ return;
+ }
+
+ x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (cpuc->enabled)
+ x86_pmu_enable_counter(hwc, idx);
+ else
+ x86_pmu_disable_counter(hwc, idx);
+}
+
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
+{
+ unsigned int event;
+
+ if (!x86_pmu.num_counters_fixed)
+ return -1;
+
+ event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
+ return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
+ return X86_PMC_IDX_FIXED_CPU_CYCLES;
+ if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
+ return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
+ return -1;
+}
+
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
+static int x86_pmu_enable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx;
+
+ idx = fixed_mode_idx(counter, hwc);
+ if (idx >= 0) {
+ /*
+ * Try to get the fixed counter, if that is already taken
+ * then try to get a generic counter:
+ */
+ if (test_and_set_bit(idx, cpuc->used_mask))
+ goto try_generic;
+
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ /*
+ * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+ * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+ */
+ hwc->counter_base =
+ MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
+ hwc->idx = idx;
+ } else {
+ idx = hwc->idx;
+ /* Try to get the previous generic counter again */
+ if (test_and_set_bit(idx, cpuc->used_mask)) {
+try_generic:
+ idx = find_first_zero_bit(cpuc->used_mask,
+ x86_pmu.num_counters);
+ if (idx == x86_pmu.num_counters)
+ return -EAGAIN;
+
+ set_bit(idx, cpuc->used_mask);
+ hwc->idx = idx;
+ }
+ hwc->config_base = x86_pmu.eventsel;
+ hwc->counter_base = x86_pmu.perfctr;
+ }
+
+ perf_counters_lapic_init();
+
+ x86_pmu.disable(hwc, idx);
+
+ cpuc->counters[idx] = counter;
+ set_bit(idx, cpuc->active_mask);
+
+ x86_perf_counter_set_period(counter, hwc, idx);
+ x86_pmu.enable(hwc, idx);
+
+ return 0;
+}
+
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+
+ if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+ cpuc->counters[hwc->idx] != counter))
+ return;
+
+ x86_pmu.enable(hwc, hwc->idx);
+}
+
+void perf_counter_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ struct cpu_hw_counters *cpuc;
+ unsigned long flags;
+ int cpu, idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ if (x86_pmu.version >= 2) {
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+ pr_info("\n");
+ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ pr_info("CPU#%d: status: %016llx\n", cpu, status);
+ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
+ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ }
+ pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+ rdmsrl(x86_pmu.perfctr + idx, pmc_count);
+
+ prev_left = per_cpu(prev_left[idx], cpu);
+
+ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ }
+ local_irq_restore(flags);
+}
+
+static void x86_pmu_disable(struct perf_counter *counter)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+
+ /*
+ * Must be done before we disable, otherwise the nmi handler
+ * could reenable again:
+ */
+ clear_bit(idx, cpuc->active_mask);
+ x86_pmu.disable(hwc, idx);
+
+ /*
+ * Make sure the cleared pointer becomes visible before we
+ * (potentially) free the counter:
+ */
+ barrier();
+
+ /*
+ * Drain the remaining delta count out of a counter
+ * that we are disabling:
+ */
+ x86_perf_counter_update(counter, hwc, idx);
+ cpuc->counters[idx] = NULL;
+ clear_bit(idx, cpuc->used_mask);
+}
+
+/*
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
+ */
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
+{
+ struct hw_perf_counter *hwc = &counter->hw;
+ int idx = hwc->idx;
+ int ret;
+
+ x86_perf_counter_update(counter, hwc, idx);
+ ret = x86_perf_counter_set_period(counter, hwc, idx);
+
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+ intel_pmu_enable_counter(hwc, idx);
+
+ return ret;
+}
+
+static void intel_pmu_reset(void)
+{
+ unsigned long flags;
+ int idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+ checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+ }
+
+ local_irq_restore(flags);
+}
+
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static int intel_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ int bit, cpu, loops;
+ u64 ack, status;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ perf_disable();
+ status = intel_pmu_get_status();
+ if (!status) {
+ perf_enable();
+ return 0;
+ }
+
+ loops = 0;
+again:
+ if (++loops > 100) {
+ WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+ perf_counter_print_debug();
+ intel_pmu_reset();
+ perf_enable();
+ return 1;
+ }
+
+ inc_irq_stat(apic_perf_irqs);
+ ack = status;
+ for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+ struct perf_counter *counter = cpuc->counters[bit];
+
+ clear_bit(bit, (unsigned long *) &status);
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ if (!intel_pmu_save_and_restart(counter))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ intel_pmu_disable_counter(&counter->hw, bit);
+ }
+
+ intel_pmu_ack_status(ack);
+
+ /*
+ * Repeat if there is more work to be done:
+ */
+ status = intel_pmu_get_status();
+ if (status)
+ goto again;
+
+ perf_enable();
+
+ return 1;
+}
+
+static int amd_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_counters *cpuc;
+ struct perf_counter *counter;
+ struct hw_perf_counter *hwc;
+ int cpu, idx, handled = 0;
+ u64 val;
+
+ data.regs = regs;
+ data.addr = 0;
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ counter = cpuc->counters[idx];
+ hwc = &counter->hw;
+
+ val = x86_perf_counter_update(counter, hwc, idx);
+ if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+ continue;
+
+ /*
+ * counter overflow
+ */
+ handled = 1;
+ data.period = counter->hw.last_period;
+
+ if (!x86_perf_counter_set_period(counter, hwc, idx))
+ continue;
+
+ if (perf_counter_overflow(counter, 1, &data))
+ amd_pmu_disable_counter(hwc, idx);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+ irq_enter();
+ ack_APIC_irq();
+ inc_irq_stat(apic_pending_irqs);
+ perf_counter_do_pending();
+ irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+ apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
+void perf_counters_lapic_init(void)
+{
+ if (!x86_pmu_initialized())
+ return;
+
+ /*
+ * Always use NMI for PMU
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+
+ if (!atomic_read(&active_counters))
+ return NOTIFY_DONE;
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ /*
+ * Can't rely on the handled return value to say it was our NMI, two
+ * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+ *
+ * If the first NMI handles both, the latter will be empty and daze
+ * the CPU.
+ */
+ x86_pmu.handle_irq(regs);
+
+ return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+ .notifier_call = perf_counter_nmi_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static struct x86_pmu intel_pmu = {
+ .name = "Intel",
+ .handle_irq = intel_pmu_handle_irq,
+ .disable_all = intel_pmu_disable_all,
+ .enable_all = intel_pmu_enable_all,
+ .enable = intel_pmu_enable_counter,
+ .disable = intel_pmu_disable_counter,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .raw_event = intel_pmu_raw_event,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ /*
+ * Intel PMCs cannot be accessed sanely above 32 bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic counter period:
+ */
+ .max_period = (1ULL << 31) - 1,
+};
+
+static struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = amd_pmu_handle_irq,
+ .disable_all = amd_pmu_disable_all,
+ .enable_all = amd_pmu_enable_all,
+ .enable = amd_pmu_enable_counter,
+ .disable = amd_pmu_disable_counter,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .event_map = amd_pmu_event_map,
+ .raw_event = amd_pmu_raw_event,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_counters = 4,
+ .counter_bits = 48,
+ .counter_mask = (1ULL << 48) - 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+};
+
+static int intel_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int ebx;
+ int version;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return -ENODEV;
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Branch Misses Retired Event or not.
+ */
+ cpuid(10, &eax.full, &ebx, &unused, &edx.full);
+ if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ return -ENODEV;
+
+ version = eax.split.version_id;
+ if (version < 2)
+ return -ENODEV;
+
+ x86_pmu = intel_pmu;
+ x86_pmu.version = version;
+ x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.counter_bits = eax.split.bit_width;
+ x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+
+ /*
+ * Quirk: v2 perfmon does not report fixed-purpose counters, so
+ * assume at least 3 counters:
+ */
+ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ /*
+ * Install the hw-cache-events table:
+ */
+ switch (boot_cpu_data.x86_model) {
+ case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+ case 29: /* six-core 45 nm xeon "Dunnington" */
+ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Core2 events, ");
+ break;
+ default:
+ case 26:
+ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Nehalem/Corei7 events, ");
+ break;
+ case 28:
+ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Atom events, ");
+ break;
+ }
+ return 0;
+}
+
+static int amd_pmu_init(void)
+{
+ x86_pmu = amd_pmu;
+
+ switch (boot_cpu_data.x86) {
+ case 0x0f:
+ case 0x10:
+ case 0x11:
+ memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("AMD Family 0f/10/11 events, ");
+ break;
+ }
+ return 0;
+}
+
+void __init init_hw_perf_counters(void)
+{
+ int err;
+
+ pr_info("Performance Counters: ");
+
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ err = intel_pmu_init();
+ break;
+ case X86_VENDOR_AMD:
+ err = amd_pmu_init();
+ break;
+ default:
+ return;
+ }
+ if (err != 0) {
+ pr_cont("no PMU driver, software counters only.\n");
+ return;
+ }
+
+ pr_cont("%s PMU driver.\n", x86_pmu.name);
+
+ if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+ x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
+ WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+ x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+ }
+ perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+ perf_max_counters = x86_pmu.num_counters;
+
+ if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+ x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
+ WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+ x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+ }
+
+ perf_counter_mask |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+
+ perf_counters_lapic_init();
+ register_die_notifier(&perf_counter_nmi_notifier);
+
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.counter_bits);
+ pr_info("... generic counters: %d\n", x86_pmu.num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... counter mask: %016Lx\n", perf_counter_mask);
+}
+
+static inline void x86_pmu_read(struct perf_counter *counter)
+{
+ x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
+static const struct pmu pmu = {
+ .enable = x86_pmu_enable,
+ .disable = x86_pmu_disable,
+ .read = x86_pmu_read,
+ .unthrottle = x86_pmu_unthrottle,
+};
+
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
+{
+ int err;
+
+ err = __hw_perf_counter_init(counter);
+ if (err)
+ return ERR_PTR(err);
+
+ return &pmu;
+}
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+ if (entry->nr < MAX_STACK_DEPTH)
+ entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ /* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+ /* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+ /* Don't bother with IRQ stacks for now */
+ return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+ struct perf_callchain_entry *entry = data;
+
+ if (reliable)
+ callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+ .warning = backtrace_warning,
+ .warning_symbol = backtrace_warning_symbol,
+ .stack = backtrace_stack,
+ .address = backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ unsigned long bp;
+ char *stack;
+ int nr = entry->nr;
+
+ callchain_store(entry, instruction_pointer(regs));
+
+ stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+ bp = frame_pointer(regs);
+#else
+ bp = 0;
+#endif
+
+ dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+ entry->kernel = entry->nr - nr;
+}
+
+
+struct stack_frame {
+ const void __user *next_fp;
+ unsigned long return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+ int ret;
+
+ if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+ return 0;
+
+ ret = 1;
+ pagefault_disable();
+ if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+ ret = 0;
+ pagefault_enable();
+
+ return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ struct stack_frame frame;
+ const void __user *fp;
+ int nr = entry->nr;
+
+ regs = (struct pt_regs *)current->thread.sp0 - 1;
+ fp = (void __user *)regs->bp;
+
+ callchain_store(entry, regs->ip);
+
+ while (entry->nr < MAX_STACK_DEPTH) {
+ frame.next_fp = NULL;
+ frame.return_address = 0;
+
+ if (!copy_stack_frame(fp, &frame))
+ break;
+
+ if ((unsigned long)fp < user_stack_pointer(regs))
+ break;
+
+ callchain_store(entry, frame.return_address);
+ fp = frame.next_fp;
+ }
+
+ entry->user = entry->nr - nr;
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+ int is_user;
+
+ if (!regs)
+ return;
+
+ is_user = user_mode(regs);
+
+ if (!current || current->pid == 0)
+ return;
+
+ if (is_user && current->state != TASK_RUNNING)
+ return;
+
+ if (!is_user)
+ perf_callchain_kernel(regs, entry);
+
+ if (current->mm)
+ perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ struct perf_callchain_entry *entry;
+
+ if (in_nmi())
+ entry = &__get_cpu_var(nmi_entry);
+ else
+ entry = &__get_cpu_var(irq_entry);
+
+ entry->nr = 0;
+ entry->hv = 0;
+ entry->kernel = 0;
+ entry->user = 0;
+
+ perf_do_callchain(regs, entry);
+
+ return entry;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e3..d6f5b9fbde3 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
#include <linux/nmi.h>
#include <linux/kprobes.h>
-#include <asm/genapic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+#include <asm/perf_counter.h>
struct nmi_watchdog_ctlblk {
unsigned int cccr_msr;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765..48bfe138603 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
* Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
*/
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
#include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/kernel.h>
+#include <linux/trace_clock.h>
+
+#include <asm/ds.h>
+#include "ds_selftest.h"
/*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
*/
struct ds_configuration {
- /* the name of the configuration */
- const char *name;
- /* the size of one pointer-typed field in the DS structure and
- in the BTS and PEBS buffers in bytes;
- this covers the first 8 DS fields related to buffer management. */
- unsigned char sizeof_field;
- /* the size of a BTS/PEBS record in bytes */
- unsigned char sizeof_rec[2];
- /* a series of bit-masks to control various features indexed
- * by enum ds_feature */
- unsigned long ctl[dsf_ctl_max];
+ /* The name of the configuration: */
+ const char *name;
+
+ /* The size of pointer-typed fields in DS, BTS, and PEBS: */
+ unsigned char sizeof_ptr_field;
+
+ /* The size of a BTS/PEBS record in bytes: */
+ unsigned char sizeof_rec[2];
+
+ /* The number of pebs counter reset values in the DS structure. */
+ unsigned char nr_counter_reset;
+
+ /* Control bit-masks indexed by enum ds_feature: */
+ unsigned long ctl[dsf_ctl_max];
};
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
+
+
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS 0x80
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS (3 * 8)
-#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT (1 << 3)
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
- ds_cfg.ctl[dsf_bts_overflow])
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS 8
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE 8
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL \
+ ( ds_cfg.ctl[dsf_bts] | \
+ ds_cfg.ctl[dsf_bts_kernel] | \
+ ds_cfg.ctl[dsf_bts_user] | \
+ ds_cfg.ctl[dsf_bts_overflow] )
/*
* A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
* to identify tracers.
*/
struct ds_tracer {
- /* the DS context (partially) owned by this tracer */
- struct ds_context *context;
- /* the buffer provided on ds_request() and its size in bytes */
- void *buffer;
- size_t size;
+ /* The DS context (partially) owned by this tracer. */
+ struct ds_context *context;
+ /* The buffer provided on ds_request() and its size in bytes. */
+ void *buffer;
+ size_t size;
};
struct bts_tracer {
- /* the common DS part */
- struct ds_tracer ds;
- /* the trace including the DS configuration */
- struct bts_trace trace;
- /* buffer overflow notification function */
- bts_ovfl_callback_t ovfl;
+ /* The common DS part: */
+ struct ds_tracer ds;
+
+ /* The trace including the DS configuration: */
+ struct bts_trace trace;
+
+ /* Buffer overflow notification function: */
+ bts_ovfl_callback_t ovfl;
+
+ /* Active flags affecting trace collection. */
+ unsigned int flags;
};
struct pebs_tracer {
- /* the common DS part */
- struct ds_tracer ds;
- /* the trace including the DS configuration */
- struct pebs_trace trace;
- /* buffer overflow notification function */
- pebs_ovfl_callback_t ovfl;
+ /* The common DS part: */
+ struct ds_tracer ds;
+
+ /* The trace including the DS configuration: */
+ struct pebs_trace trace;
+
+ /* Buffer overflow notification function: */
+ pebs_ovfl_callback_t ovfl;
};
/*
@@ -97,6 +120,7 @@ struct pebs_tracer {
*
* The DS configuration consists of the following fields; different
* architetures vary in the size of those fields.
+ *
* - double-word aligned base linear address of the BTS buffer
* - write pointer into the BTS buffer
* - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
};
enum ds_qualifier {
- ds_bts = 0,
+ ds_bts = 0,
ds_pebs
};
-static inline unsigned long ds_get(const unsigned char *base,
- enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
return *(unsigned long *)base;
}
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
- enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+ unsigned long value)
{
- base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
(*(unsigned long *)base) = value;
}
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
*/
static DEFINE_SPINLOCK(ds_lock);
-
/*
* We either support (system-wide) per-cpu or per-thread allocation.
* We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
*/
static atomic_t tracers = ATOMIC_INIT(0);
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
{
- if (task)
+ int error;
+
+ spin_lock_irq(&ds_lock);
+
+ if (task) {
+ error = -EPERM;
+ if (atomic_read(&tracers) < 0)
+ goto out;
atomic_inc(&tracers);
- else
+ } else {
+ error = -EPERM;
+ if (atomic_read(&tracers) > 0)
+ goto out;
atomic_dec(&tracers);
+ }
+
+ error = 0;
+out:
+ spin_unlock_irq(&ds_lock);
+ return error;
}
static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
atomic_inc(&tracers);
}
-static inline int check_tracer(struct task_struct *task)
-{
- return task ?
- (atomic_read(&tracers) >= 0) :
- (atomic_read(&tracers) <= 0);
-}
-
-
/*
* The DS context is either attached to a thread or to a cpu:
* - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
* deallocated when the last user puts the context.
*/
struct ds_context {
- /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
- unsigned char ds[MAX_SIZEOF_DS];
- /* the owner of the BTS and PEBS configuration, respectively */
- struct bts_tracer *bts_master;
- struct pebs_tracer *pebs_master;
- /* use count */
- unsigned long count;
- /* a pointer to the context location inside the thread_struct
- * or the per_cpu context array */
- struct ds_context **this;
- /* a pointer to the task owning this context, or NULL, if the
- * context is owned by a cpu */
- struct task_struct *task;
-};
+ /* The DS configuration; goes into MSR_IA32_DS_AREA: */
+ unsigned char ds[MAX_SIZEOF_DS];
+
+ /* The owner of the BTS and PEBS configuration, respectively: */
+ struct bts_tracer *bts_master;
+ struct pebs_tracer *pebs_master;
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+ /* Use count: */
+ unsigned long count;
-#define system_context per_cpu(system_context_array, smp_processor_id())
+ /* Pointer to the context pointer field: */
+ struct ds_context **this;
+
+ /* The traced task; NULL for cpu tracing: */
+ struct task_struct *task;
+
+ /* The traced cpu; only valid if task is NULL: */
+ int cpu;
+};
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
{
struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &system_context);
+ (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
struct ds_context *context = NULL;
struct ds_context *new_context = NULL;
- unsigned long irq;
/* Chances are small that we already have a context. */
new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
if (!new_context)
return NULL;
- spin_lock_irqsave(&ds_lock, irq);
+ spin_lock_irq(&ds_lock);
context = *p_context;
- if (!context) {
+ if (likely(!context)) {
context = new_context;
context->this = p_context;
context->task = task;
+ context->cpu = cpu;
context->count = 0;
- if (task)
- set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
- if (!task || (task == current))
- wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
*p_context = context;
}
context->count++;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
if (context != new_context)
kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
return context;
}
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
{
+ struct task_struct *task;
unsigned long irq;
if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
*(context->this) = NULL;
- if (context->task)
- clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+ task = context->task;
+
+ if (task)
+ clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
- if (!context->task || (context->task == current))
- wrmsrl(MSR_IA32_DS_AREA, 0);
+ /*
+ * We leave the (now dangling) pointer to the DS configuration in
+ * the DS_AREA msr. This is as good or as bad as replacing it with
+ * NULL - the hardware would crash if we enabled tracing.
+ *
+ * This saves us some problems with having to write an msr on a
+ * different cpu while preventing others from doing the same for the
+ * next context for that same cpu.
+ */
spin_unlock_irqrestore(&ds_lock, irq);
+ /* The context might still be in use for context switching. */
+ if (task && (task != current))
+ wait_task_context_switch(task);
+
kfree(context);
}
+static void ds_install_ds_area(struct ds_context *context)
+{
+ unsigned long ds;
+
+ ds = (unsigned long)context->ds;
+
+ /*
+ * There is a race between the bts master and the pebs master.
+ *
+ * The thread/cpu access is synchronized via get/put_cpu() for
+ * task tracing and via wrmsr_on_cpu for cpu tracing.
+ *
+ * If bts and pebs are collected for the same task or same cpu,
+ * the same confiuration is written twice.
+ */
+ if (context->task) {
+ get_cpu();
+ if (context->task == current)
+ wrmsrl(MSR_IA32_DS_AREA, ds);
+ set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+ put_cpu();
+ } else
+ wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
/*
* Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
* The remainder of any partially written record is zeroed out.
*
* context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual: the buffer type
+ * record: the data to write
+ * size: the size of the data
*/
static int ds_write(struct ds_context *context, enum ds_qualifier qual,
const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
unsigned long write_size, adj_write_size;
/*
- * write as much as possible without producing an
+ * Write as much as possible without producing an
* overflow interrupt.
*
- * interrupt_threshold must either be
+ * Interrupt_threshold must either be
* - bigger than absolute_maximum or
* - point to a record between buffer_base and absolute_maximum
*
- * index points to a valid record.
+ * Index points to a valid record.
*/
base = ds_get(context->ds, qual, ds_buffer_base);
index = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
write_end = min(end, int_th);
- /* if we are already beyond the interrupt threshold,
- * we fill the entire buffer */
+ /*
+ * If we are already beyond the interrupt threshold,
+ * we fill the entire buffer.
+ */
if (write_end <= index)
write_end = end;
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
adj_write_size *= ds_cfg.sizeof_rec[qual];
- /* zero out trailing bytes */
+ /* Zero out trailing bytes. */
memset((char *)index + write_size, 0,
adj_write_size - write_size);
index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
* Later architectures use 64bit pointers throughout, whereas earlier
* architectures use 32bit pointers in 32bit mode.
*
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
* - the field size stored in the DS configuration
* - the relative field position
*
@@ -431,23 +501,23 @@ enum bts_field {
bts_to,
bts_flags,
- bts_qual = bts_from,
- bts_jiffies = bts_to,
- bts_pid = bts_flags,
+ bts_qual = bts_from,
+ bts_clock = bts_to,
+ bts_pid = bts_flags,
- bts_qual_mask = (bts_qual_max - 1),
- bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+ bts_qual_mask = (bts_qual_max - 1),
+ bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
};
static inline unsigned long bts_get(const char *base, enum bts_field field)
{
- base += (ds_cfg.sizeof_field * field);
+ base += (ds_cfg.sizeof_ptr_field * field);
return *(unsigned long *)base;
}
static inline void bts_set(char *base, enum bts_field field, unsigned long val)
{
- base += (ds_cfg.sizeof_field * field);;
+ base += (ds_cfg.sizeof_ptr_field * field);;
(*(unsigned long *)base) = val;
}
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
*
* return: bytes read/written on success; -Eerrno, otherwise
*/
-static int bts_read(struct bts_tracer *tracer, const void *at,
- struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
{
if (!tracer)
return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
memset(out, 0, sizeof(*out));
if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
- out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
- out->variant.timestamp.pid = bts_get(at, bts_pid);
+ out->variant.event.clock = bts_get(at, bts_clock);
+ out->variant.event.pid = bts_get(at, bts_pid);
} else {
out->qualifier = bts_branch;
out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
case bts_task_arrives:
case bts_task_departs:
bts_set(raw, bts_qual, (bts_escape | in->qualifier));
- bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
- bts_set(raw, bts_pid, in->variant.timestamp.pid);
+ bts_set(raw, bts_clock, in->variant.event.clock);
+ bts_set(raw, bts_pid, in->variant.event.pid);
break;
default:
return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
unsigned int flags) {
unsigned long buffer, adj;
- /* adjust the buffer address and size to meet alignment
+ /*
+ * Adjust the buffer address and size to meet alignment
* constraints:
* - buffer is double-word aligned
* - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
trace->begin = (void *)buffer;
trace->top = trace->begin;
trace->end = (void *)(buffer + size);
- /* The value for 'no threshold' is -1, which will set the
+ /*
+ * The value for 'no threshold' is -1, which will set the
* threshold outside of the buffer, just like we want it.
*/
+ ith *= ds_cfg.sizeof_rec[qual];
trace->ith = (void *)(buffer + size - ith);
trace->flags = flags;
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
enum ds_qualifier qual, struct task_struct *task,
- void *base, size_t size, size_t th, unsigned int flags)
+ int cpu, void *base, size_t size, size_t th)
{
struct ds_context *context;
int error;
+ size_t req_size;
+
+ error = -EOPNOTSUPP;
+ if (!ds_cfg.sizeof_rec[qual])
+ goto out;
error = -EINVAL;
if (!base)
goto out;
- /* we require some space to do alignment adjustments below */
+ req_size = ds_cfg.sizeof_rec[qual];
+ /* We might need space for alignment adjustments. */
+ if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+ req_size += DS_ALIGNMENT;
+
error = -EINVAL;
- if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+ if (size < req_size)
goto out;
if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
tracer->size = size;
error = -ENOMEM;
- context = ds_get_context(task);
+ context = ds_get_context(task, cpu);
if (!context)
goto out;
tracer->context = context;
- ds_init_ds_trace(trace, qual, base, size, th, flags);
+ /*
+ * Defer any tracer-specific initialization work for the context until
+ * context ownership has been clarified.
+ */
error = 0;
out:
return error;
}
-struct bts_tracer *ds_request_bts(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
struct bts_tracer *tracer;
- unsigned long irq;
int error;
+ /* Buffer overflow notification is not yet implemented. */
error = -EOPNOTSUPP;
- if (!ds_cfg.ctl[dsf_bts])
+ if (ovfl)
goto out;
- /* buffer overflow notification is not yet implemented */
- error = -EOPNOTSUPP;
- if (ovfl)
+ error = get_tracer(task);
+ if (error < 0)
goto out;
error = -ENOMEM;
tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
if (!tracer)
- goto out;
+ goto out_put_tracer;
tracer->ovfl = ovfl;
+ /* Do some more error checking and acquire a tracing context. */
error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_bts, task, base, size, th, flags);
+ ds_bts, task, cpu, base, size, th);
if (error < 0)
goto out_tracer;
-
- spin_lock_irqsave(&ds_lock, irq);
-
- error = -EPERM;
- if (!check_tracer(task))
- goto out_unlock;
- get_tracer(task);
+ /* Claim the bts part of the tracing context we acquired above. */
+ spin_lock_irq(&ds_lock);
error = -EPERM;
if (tracer->ds.context->bts_master)
- goto out_put_tracer;
+ goto out_unlock;
tracer->ds.context->bts_master = tracer;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
+ /*
+ * Now that we own the bts part of the context, let's complete the
+ * initialization for that part.
+ */
+ ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+ ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ ds_install_ds_area(tracer->ds.context);
tracer->trace.read = bts_read;
tracer->trace.write = bts_write;
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+ /* Start tracing. */
ds_resume_bts(tracer);
return tracer;
- out_put_tracer:
- put_tracer(task);
out_unlock:
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
ds_put_context(tracer->ds.context);
out_tracer:
kfree(tracer);
+ out_put_tracer:
+ put_tracer(task);
out:
return ERR_PTR(error);
}
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+ void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+ bts_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl, size_t th,
+ unsigned int flags)
{
struct pebs_tracer *tracer;
- unsigned long irq;
int error;
- /* buffer overflow notification is not yet implemented */
+ /* Buffer overflow notification is not yet implemented. */
error = -EOPNOTSUPP;
if (ovfl)
goto out;
+ error = get_tracer(task);
+ if (error < 0)
+ goto out;
+
error = -ENOMEM;
tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
if (!tracer)
- goto out;
+ goto out_put_tracer;
tracer->ovfl = ovfl;
+ /* Do some more error checking and acquire a tracing context. */
error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_pebs, task, base, size, th, flags);
+ ds_pebs, task, cpu, base, size, th);
if (error < 0)
goto out_tracer;
- spin_lock_irqsave(&ds_lock, irq);
-
- error = -EPERM;
- if (!check_tracer(task))
- goto out_unlock;
- get_tracer(task);
+ /* Claim the pebs part of the tracing context we acquired above. */
+ spin_lock_irq(&ds_lock);
error = -EPERM;
if (tracer->ds.context->pebs_master)
- goto out_put_tracer;
+ goto out_unlock;
tracer->ds.context->pebs_master = tracer;
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
+ /*
+ * Now that we own the pebs part of the context, let's complete the
+ * initialization for that part.
+ */
+ ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+ ds_install_ds_area(tracer->ds.context);
+
+ /* Start tracing. */
ds_resume_pebs(tracer);
return tracer;
- out_put_tracer:
- put_tracer(task);
out_unlock:
- spin_unlock_irqrestore(&ds_lock, irq);
+ spin_unlock_irq(&ds_lock);
ds_put_context(tracer->ds.context);
out_tracer:
kfree(tracer);
+ out_put_tracer:
+ put_tracer(task);
out:
return ERR_PTR(error);
}
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+ void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
{
- if (!tracer)
- return;
+ return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
- ds_suspend_bts(tracer);
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+ pebs_ovfl_callback_t ovfl,
+ size_t th, unsigned int flags)
+{
+ return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+
+ task = tracer->ds.context->task;
WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
tracer->ds.context->bts_master = NULL;
- put_tracer(tracer->ds.context->task);
+ /* Make sure tracing stopped and the tracer is not in use. */
+ if (task && (task != current))
+ wait_task_context_switch(task);
+
ds_put_context(tracer->ds.context);
+ put_tracer(task);
kfree(tracer);
}
+void ds_release_bts(struct bts_tracer *tracer)
+{
+ might_sleep();
+
+ if (!tracer)
+ return;
+
+ ds_suspend_bts(tracer);
+ ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long irq;
+ int error;
+
+ if (!tracer)
+ return 0;
+
+ task = tracer->ds.context->task;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task &&
+ (tracer->ds.context->cpu != smp_processor_id()))
+ goto out;
+
+ error = -EPERM;
+ if (task && (task != current))
+ goto out;
+
+ ds_suspend_bts_noirq(tracer);
+ ds_free_bts(tracer);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+ unsigned long debugctlmsr)
+{
+ task->thread.debugctlmsr = debugctlmsr;
+
+ get_cpu();
+ if (task == current)
+ update_debugctlmsr(debugctlmsr);
+ put_cpu();
+}
+
void ds_suspend_bts(struct bts_tracer *tracer)
{
struct task_struct *task;
+ unsigned long debugctlmsr;
+ int cpu;
if (!tracer)
return;
+ tracer->flags = 0;
+
task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
- if (!task || (task == current))
- update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+ WARN_ON(!task && irqs_disabled());
- if (task) {
- task->thread.debugctlmsr &= ~BTS_CONTROL;
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr_on_cpu(cpu));
+ debugctlmsr &= ~BTS_CONTROL;
- if (!task->thread.debugctlmsr)
- clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
- }
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr_on_cpu(cpu, debugctlmsr);
}
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
{
struct task_struct *task;
- unsigned long control;
+ unsigned long debugctlmsr, irq;
+ int cpu, error = 0;
if (!tracer)
- return;
+ return 0;
+
+ tracer->flags = 0;
task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task && (cpu != smp_processor_id()))
+ goto out;
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr());
+ debugctlmsr &= ~BTS_CONTROL;
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr(debugctlmsr);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+ unsigned long control;
control = ds_cfg.ctl[dsf_bts];
if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
if (!(tracer->trace.ds.flags & BTS_USER))
control |= ds_cfg.ctl[dsf_bts_user];
- if (task) {
- task->thread.debugctlmsr |= control;
- set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
- }
-
- if (!task || (task == current))
- update_debugctlmsr(get_debugctlmsr() | control);
+ return control;
}
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
{
+ struct task_struct *task;
+ unsigned long debugctlmsr;
+ int cpu;
+
if (!tracer)
return;
- ds_suspend_pebs(tracer);
+ tracer->flags = tracer->trace.ds.flags;
+
+ task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ WARN_ON(!task && irqs_disabled());
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr_on_cpu(cpu));
+ debugctlmsr |= ds_bts_control(tracer);
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long debugctlmsr, irq;
+ int cpu, error = 0;
+
+ if (!tracer)
+ return 0;
+
+ tracer->flags = tracer->trace.ds.flags;
+
+ task = tracer->ds.context->task;
+ cpu = tracer->ds.context->cpu;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task && (cpu != smp_processor_id()))
+ goto out;
+
+ debugctlmsr = (task ?
+ task->thread.debugctlmsr :
+ get_debugctlmsr());
+ debugctlmsr |= ds_bts_control(tracer);
+
+ if (task)
+ update_task_debugctlmsr(task, debugctlmsr);
+ else
+ update_debugctlmsr(debugctlmsr);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+ struct task_struct *task;
+
+ task = tracer->ds.context->task;
WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
tracer->ds.context->pebs_master = NULL;
- put_tracer(tracer->ds.context->task);
ds_put_context(tracer->ds.context);
+ put_tracer(task);
kfree(tracer);
}
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+ might_sleep();
+
+ if (!tracer)
+ return;
+
+ ds_suspend_pebs(tracer);
+ ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+ struct task_struct *task;
+ unsigned long irq;
+ int error;
+
+ if (!tracer)
+ return 0;
+
+ task = tracer->ds.context->task;
+
+ local_irq_save(irq);
+
+ error = -EPERM;
+ if (!task &&
+ (tracer->ds.context->cpu != smp_processor_id()))
+ goto out;
+
+ error = -EPERM;
+ if (task && (task != current))
+ goto out;
+
+ ds_suspend_pebs_noirq(tracer);
+ ds_free_pebs(tracer);
+
+ error = 0;
+ out:
+ local_irq_restore(irq);
+ return error;
+}
+
void ds_suspend_pebs(struct pebs_tracer *tracer)
{
}
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+ return 0;
+}
+
void ds_resume_pebs(struct pebs_tracer *tracer)
{
}
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+ return 0;
+}
+
const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
{
if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
return NULL;
ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
- tracer->trace.reset_value =
- *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+
+ tracer->trace.counters = ds_cfg.nr_counter_reset;
+ memcpy(tracer->trace.counter_reset,
+ tracer->ds.context->ds +
+ (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+ ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
return &tracer->trace;
}
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
tracer->trace.ds.top = tracer->trace.ds.begin;
- ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+ ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
(unsigned long)tracer->trace.ds.top);
return 0;
}
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+ unsigned int counter, u64 value)
{
if (!tracer)
return -EINVAL;
- *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+ if (ds_cfg.nr_counter_reset < counter)
+ return -EINVAL;
+
+ *(u64 *)(tracer->ds.context->ds +
+ (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+ (counter * PEBS_RESET_FIELD_SIZE)) = value;
return 0;
}
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
.ctl[dsf_bts] = (1 << 2) | (1 << 3),
.ctl[dsf_bts_kernel] = (1 << 5),
.ctl[dsf_bts_user] = (1 << 6),
-
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
-#ifdef __i386__
- .sizeof_rec[ds_pebs] = sizeof(long) * 10,
-#else
- .sizeof_rec[ds_pebs] = sizeof(long) * 18,
-#endif
+ .nr_counter_reset = 1,
};
static const struct ds_configuration ds_cfg_pentium_m = {
.name = "Pentium M",
.ctl[dsf_bts] = (1 << 6) | (1 << 7),
-
- .sizeof_field = sizeof(long),
- .sizeof_rec[ds_bts] = sizeof(long) * 3,
-#ifdef __i386__
- .sizeof_rec[ds_pebs] = sizeof(long) * 10,
-#else
- .sizeof_rec[ds_pebs] = sizeof(long) * 18,
-#endif
+ .nr_counter_reset = 1,
};
static const struct ds_configuration ds_cfg_core2_atom = {
.name = "Core 2/Atom",
.ctl[dsf_bts] = (1 << 6) | (1 << 7),
.ctl[dsf_bts_kernel] = (1 << 9),
.ctl[dsf_bts_user] = (1 << 10),
-
- .sizeof_field = 8,
- .sizeof_rec[ds_bts] = 8 * 3,
- .sizeof_rec[ds_pebs] = 8 * 18,
+ .nr_counter_reset = 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+ .name = "Core i7",
+ .ctl[dsf_bts] = (1 << 6) | (1 << 7),
+ .ctl[dsf_bts_kernel] = (1 << 9),
+ .ctl[dsf_bts_user] = (1 << 10),
+ .nr_counter_reset = 4,
};
static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+ struct cpuinfo_x86 *cpu)
{
+ unsigned long nr_pebs_fields = 0;
+
+ printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+ nr_pebs_fields = 10;
+#else
+ nr_pebs_fields = 18;
+#endif
+
+ /*
+ * Starting with version 2, architectural performance
+ * monitoring supports a format specifier.
+ */
+ if ((cpuid_eax(0xa) & 0xff) > 1) {
+ unsigned long perf_capabilities, format;
+
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+ format = (perf_capabilities >> 8) & 0xf;
+
+ switch (format) {
+ case 0:
+ nr_pebs_fields = 18;
+ break;
+ case 1:
+ nr_pebs_fields = 22;
+ break;
+ default:
+ printk(KERN_INFO
+ "[ds] unknown PEBS format: %lu\n", format);
+ nr_pebs_fields = 0;
+ break;
+ }
+ }
+
memset(&ds_cfg, 0, sizeof(ds_cfg));
ds_cfg = *cfg;
- printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+ ds_cfg.sizeof_ptr_field =
+ (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
+
+ ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
+ ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
- if (!cpu_has_bts) {
- ds_cfg.ctl[dsf_bts] = 0;
+ if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+ ds_cfg.sizeof_rec[ds_bts] = 0;
printk(KERN_INFO "[ds] bts not available\n");
}
- if (!cpu_has_pebs)
+ if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+ ds_cfg.sizeof_rec[ds_pebs] = 0;
printk(KERN_INFO "[ds] pebs not available\n");
+ }
+
+ printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+ 8 * ds_cfg.sizeof_ptr_field);
+ printk("bts/pebs record: %u/%u bytes\n",
+ ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
- WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+ WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
}
void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
{
+ /* Only configure the first cpu. Others are identical. */
+ if (ds_cfg.name)
+ return;
+
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
case 0x9:
case 0xd: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m);
+ ds_configure(&ds_cfg_pentium_m, c);
break;
case 0xf:
case 0x17: /* Core2 */
case 0x1c: /* Atom */
- ds_configure(&ds_cfg_core2_atom);
+ ds_configure(&ds_cfg_core2_atom, c);
+ break;
+ case 0x1a: /* Core i7 */
+ ds_configure(&ds_cfg_core_i7, c);
break;
- case 0x1a: /* i7 */
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst);
+ ds_configure(&ds_cfg_netburst, c);
break;
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
break;
default:
- /* sorry, don't know about them */
+ /* Sorry, don't know about them. */
break;
}
}
+static inline void ds_take_timestamp(struct ds_context *context,
+ enum bts_qualifier qualifier,
+ struct task_struct *task)
+{
+ struct bts_tracer *tracer = context->bts_master;
+ struct bts_struct ts;
+
+ /* Prevent compilers from reading the tracer pointer twice. */
+ barrier();
+
+ if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+ return;
+
+ memset(&ts, 0, sizeof(ts));
+ ts.qualifier = qualifier;
+ ts.variant.event.clock = trace_clock_global();
+ ts.variant.event.pid = task->pid;
+
+ bts_write(tracer, &ts);
+}
+
/*
* Change the DS configuration from tracing prev to tracing next.
*/
void ds_switch_to(struct task_struct *prev, struct task_struct *next)
{
- struct ds_context *prev_ctx = prev->thread.ds_ctx;
- struct ds_context *next_ctx = next->thread.ds_ctx;
+ struct ds_context *prev_ctx = prev->thread.ds_ctx;
+ struct ds_context *next_ctx = next->thread.ds_ctx;
+ unsigned long debugctlmsr = next->thread.debugctlmsr;
+
+ /* Make sure all data is read before we start. */
+ barrier();
if (prev_ctx) {
update_debugctlmsr(0);
- if (prev_ctx->bts_master &&
- (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
- struct bts_struct ts = {
- .qualifier = bts_task_departs,
- .variant.timestamp.jiffies = jiffies_64,
- .variant.timestamp.pid = prev->pid
- };
- bts_write(prev_ctx->bts_master, &ts);
- }
+ ds_take_timestamp(prev_ctx, bts_task_departs, prev);
}
if (next_ctx) {
- if (next_ctx->bts_master &&
- (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
- struct bts_struct ts = {
- .qualifier = bts_task_arrives,
- .variant.timestamp.jiffies = jiffies_64,
- .variant.timestamp.pid = next->pid
- };
- bts_write(next_ctx->bts_master, &ts);
- }
+ ds_take_timestamp(next_ctx, bts_task_arrives, next);
wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
}
- update_debugctlmsr(next->thread.debugctlmsr);
+ update_debugctlmsr(debugctlmsr);
}
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+static __init int ds_selftest(void)
{
- clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
- tsk->thread.ds_ctx = NULL;
-}
+ if (ds_cfg.sizeof_rec[ds_bts]) {
+ int error;
-void ds_exit_thread(struct task_struct *tsk)
-{
+ error = ds_selftest_bts();
+ if (error) {
+ WARN(1, "[ds] selftest failed. disabling bts.\n");
+ ds_cfg.sizeof_rec[ds_bts] = 0;
+ }
+ }
+
+ if (ds_cfg.sizeof_rec[ds_pebs]) {
+ int error;
+
+ error = ds_selftest_pebs();
+ if (error) {
+ WARN(1, "[ds] selftest failed. disabling pebs.\n");
+ ds_cfg.sizeof_rec[ds_pebs] = 0;
+ }
+ }
+
+ return 0;
}
+device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 00000000000..6bc7c199ab9
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,408 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/ds.h>
+
+
+#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
+
+struct ds_selftest_bts_conf {
+ struct bts_tracer *tracer;
+ int error;
+ int (*suspend)(struct bts_tracer *);
+ int (*resume)(struct bts_tracer *);
+};
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+ int error = 0;
+
+ if (!trace) {
+ printk(KERN_CONT "failed to access trace...");
+ /* Bail out. Other tests are pointless. */
+ return -1;
+ }
+
+ if (!trace->read) {
+ printk(KERN_CONT "bts read not available...");
+ error = -1;
+ }
+
+ /* Do some sanity checks on the trace configuration. */
+ if (!trace->ds.n) {
+ printk(KERN_CONT "empty bts buffer...");
+ error = -1;
+ }
+ if (!trace->ds.size) {
+ printk(KERN_CONT "bad bts trace setup...");
+ error = -1;
+ }
+ if (trace->ds.end !=
+ (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+ printk(KERN_CONT "bad bts buffer setup...");
+ error = -1;
+ }
+ /*
+ * We allow top in [begin; end], since its not clear when the
+ * overflow adjustment happens: after the increment or before the
+ * write.
+ */
+ if ((trace->ds.top < trace->ds.begin) ||
+ (trace->ds.end < trace->ds.top)) {
+ printk(KERN_CONT "bts top out of bounds...");
+ error = -1;
+ }
+
+ return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+ const struct bts_trace *trace,
+ const void *from, const void *to)
+{
+ const unsigned char *at;
+
+ /*
+ * Check a few things which do not belong to this test.
+ * They should be covered by other tests.
+ */
+ if (!trace)
+ return -1;
+
+ if (!trace->read)
+ return -1;
+
+ if (to < from)
+ return -1;
+
+ if (from < trace->ds.begin)
+ return -1;
+
+ if (trace->ds.end < to)
+ return -1;
+
+ if (!trace->ds.size)
+ return -1;
+
+ /* Now to the test itself. */
+ for (at = from; (void *)at < to; at += trace->ds.size) {
+ struct bts_struct bts;
+ unsigned long index;
+ int error;
+
+ if (((void *)at - trace->ds.begin) % trace->ds.size) {
+ printk(KERN_CONT
+ "read from non-integer index...");
+ return -1;
+ }
+ index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+ memset(&bts, 0, sizeof(bts));
+ error = trace->read(tracer, at, &bts);
+ if (error < 0) {
+ printk(KERN_CONT
+ "error reading bts trace at [%lu] (0x%p)...",
+ index, at);
+ return error;
+ }
+
+ switch (bts.qualifier) {
+ case BTS_BRANCH:
+ break;
+ default:
+ printk(KERN_CONT
+ "unexpected bts entry %llu at [%lu] (0x%p)...",
+ bts.qualifier, index, at);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static void ds_selftest_bts_cpu(void *arg)
+{
+ struct ds_selftest_bts_conf *conf = arg;
+ const struct bts_trace *trace;
+ void *top;
+
+ if (IS_ERR(conf->tracer)) {
+ conf->error = PTR_ERR(conf->tracer);
+ conf->tracer = NULL;
+
+ printk(KERN_CONT
+ "initialization failed (err: %d)...", conf->error);
+ return;
+ }
+
+ /* We should meanwhile have enough trace. */
+ conf->error = conf->suspend(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ /* Let's see if we can access the trace. */
+ trace = ds_read_bts(conf->tracer);
+
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ /* If everything went well, we should have a few trace entries. */
+ if (trace->ds.top == trace->ds.begin) {
+ /*
+ * It is possible but highly unlikely that we got a
+ * buffer overflow and end up at exactly the same
+ * position we started from.
+ * Let's issue a warning, but continue.
+ */
+ printk(KERN_CONT "no trace/overflow...");
+ }
+
+ /* Let's try to read the trace we collected. */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.top);
+ if (conf->error < 0)
+ return;
+
+ /*
+ * Let's read the trace again.
+ * Since we suspended tracing, we should get the same result.
+ */
+ top = trace->ds.top;
+
+ trace = ds_read_bts(conf->tracer);
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ if (top != trace->ds.top) {
+ printk(KERN_CONT "suspend not working...");
+ conf->error = -1;
+ return;
+ }
+
+ /* Let's collect some more trace - see if resume is working. */
+ conf->error = conf->resume(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ conf->error = conf->suspend(conf->tracer);
+ if (conf->error < 0)
+ return;
+
+ trace = ds_read_bts(conf->tracer);
+
+ conf->error = ds_selftest_bts_consistency(trace);
+ if (conf->error < 0)
+ return;
+
+ if (trace->ds.top == top) {
+ /*
+ * It is possible but highly unlikely that we got a
+ * buffer overflow and end up at exactly the same
+ * position we started from.
+ * Let's issue a warning and check the full trace.
+ */
+ printk(KERN_CONT
+ "no resume progress/overflow...");
+
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.end);
+ } else if (trace->ds.top < top) {
+ /*
+ * We had a buffer overflow - the entire buffer should
+ * contain trace records.
+ */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace,
+ trace->ds.begin, trace->ds.end);
+ } else {
+ /*
+ * It is quite likely that the buffer did not overflow.
+ * Let's just check the delta trace.
+ */
+ conf->error =
+ ds_selftest_bts_read(conf->tracer, trace, top,
+ trace->ds.top);
+ }
+ if (conf->error < 0)
+ return;
+
+ conf->error = 0;
+}
+
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+ ds_suspend_bts(tracer);
+ return 0;
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+ ds_resume_bts(tracer);
+ return 0;
+}
+
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+ (void)ds_release_bts_noirq(tracer);
+}
+
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+ struct bts_tracer *tracer)
+{
+ int error = -EPERM;
+
+ /* Try to release the tracer on the wrong cpu. */
+ get_cpu();
+ if (cpu != smp_processor_id()) {
+ error = ds_release_bts_noirq(tracer);
+ if (error != -EPERM)
+ printk(KERN_CONT "release on wrong cpu...");
+ }
+ put_cpu();
+
+ return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+ struct bts_tracer *tracer;
+ int error;
+
+ /* Try to request cpu tracing while task tracing is active. */
+ tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+ (size_t)-1, BTS_KERNEL);
+ error = PTR_ERR(tracer);
+ if (!IS_ERR(tracer)) {
+ ds_release_bts(tracer);
+ error = 0;
+ }
+
+ if (error != -EPERM)
+ printk(KERN_CONT "cpu/task tracing overlap...");
+
+ return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+ struct bts_tracer *tracer;
+ int error;
+
+ /* Try to request cpu tracing while task tracing is active. */
+ tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+ (size_t)-1, BTS_KERNEL);
+ error = PTR_ERR(tracer);
+ if (!IS_ERR(tracer)) {
+ error = 0;
+ ds_release_bts(tracer);
+ }
+
+ if (error != -EPERM)
+ printk(KERN_CONT "task/cpu tracing overlap...");
+
+ return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+ struct ds_selftest_bts_conf conf;
+ unsigned char buffer[BUFFER_SIZE], *small_buffer;
+ unsigned long irq;
+ int cpu;
+
+ printk(KERN_INFO "[ds] bts selftest...");
+ conf.error = 0;
+
+ small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ conf.suspend = ds_suspend_bts_wrap;
+ conf.resume = ds_resume_bts_wrap;
+ conf.tracer =
+ ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_task(buffer);
+ ds_release_bts(conf.tracer);
+ if (conf.error < 0)
+ goto out;
+
+ conf.suspend = ds_suspend_bts_noirq;
+ conf.resume = ds_resume_bts_noirq;
+ conf.tracer =
+ ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+ if (conf.error >= 0) {
+ conf.error =
+ ds_selftest_bts_bad_release_noirq(cpu,
+ conf.tracer);
+ /* We must not release the tracer twice. */
+ if (conf.error < 0)
+ conf.tracer = NULL;
+ }
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_task(buffer);
+ smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+ conf.tracer, 1);
+ if (conf.error < 0)
+ goto out;
+ }
+
+ conf.suspend = ds_suspend_bts_wrap;
+ conf.resume = ds_resume_bts_wrap;
+ conf.tracer =
+ ds_request_bts_task(current, buffer, BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+ ds_release_bts(conf.tracer);
+ if (conf.error < 0)
+ goto out;
+
+ conf.suspend = ds_suspend_bts_noirq;
+ conf.resume = ds_resume_bts_noirq;
+ conf.tracer =
+ ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
+ NULL, (size_t)-1, BTS_KERNEL);
+ local_irq_save(irq);
+ ds_selftest_bts_cpu(&conf);
+ if (conf.error >= 0)
+ conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+ ds_release_bts_noirq(conf.tracer);
+ local_irq_restore(irq);
+ if (conf.error < 0)
+ goto out;
+
+ conf.error = 0;
+ out:
+ put_online_cpus();
+ printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+ return conf.error;
+}
+
+int ds_selftest_pebs(void)
+{
+ return 0;
+}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 00000000000..2ba8745c666
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b869..81086c227ab 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *sp, unsigned long bp, char *log_lvl);
extern unsigned int code_bytes;
-extern int kstack_depth_to_print;
/* The form of the top of the frame on the stack */
struct stack_frame {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 00628130292..7271fa33d79 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
*/
__init void e820_setup_gap(void)
{
- unsigned long gapstart, gapsize, round;
+ unsigned long gapstart, gapsize;
int found;
gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
#endif
/*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
+ * e820_reserve_resources_late protect stolen RAM already
*/
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
+ pci_mem_start = gapstart;
printk(KERN_INFO
"Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
}
}
+/* How much should we pad RAM ending depending on where it is? */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+ unsigned long mb = pos >> 20;
+
+ /* To 64kB in the first megabyte */
+ if (!mb)
+ return 64*1024;
+
+ /* To 1MB in the first 16MB */
+ if (mb < 16)
+ return 1024*1024;
+
+ /* To 32MB for anything above that */
+ return 32*1024*1024;
+}
+
void __init e820_reserve_resources_late(void)
{
int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
}
+
+ /*
+ * Try to bump up RAM regions to reasonable boundaries to
+ * avoid stolen RAM:
+ */
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *entry = &e820_saved.map[i];
+ resource_size_t start, end;
+
+ if (entry->type != E820_RAM)
+ continue;
+ start = entry->addr + entry->size;
+ end = round_up(start, ram_alignment(start));
+ if (start == end)
+ continue;
+ reserve_region_with_split(&iomem_resource, start,
+ end - 1, "RAM buffer");
+ }
}
char *__init default_machine_specific_memory_setup(void)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953de..ebdb85cf268 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,6 +97,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
{
u32 d;
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
d &= 0xff;
return d;
}
+#endif
static void __init ati_bugs(int num, int slot, int func)
{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e843..a4742a340d8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
GLOBAL(return_to_handler)
subq $80, %rsp
+ /* Save the return values */
movq %rax, (%rsp)
- movq %rcx, 8(%rsp)
- movq %rdx, 16(%rsp)
- movq %rsi, 24(%rsp)
- movq %rdi, 32(%rsp)
- movq %r8, 40(%rsp)
- movq %r9, 48(%rsp)
- movq %r10, 56(%rsp)
- movq %r11, 64(%rsp)
+ movq %rdx, 8(%rsp)
call ftrace_return_to_handler
movq %rax, 72(%rsp)
- movq 64(%rsp), %r11
- movq 56(%rsp), %r10
- movq 48(%rsp), %r9
- movq 40(%rsp), %r8
- movq 32(%rsp), %rdi
- movq 24(%rsp), %rsi
- movq 16(%rsp), %rdx
- movq 8(%rsp), %rcx
+ movq 8(%rsp), %rdx
movq (%rsp), %rax
addq $72, %rsp
retq
@@ -1025,6 +1012,11 @@ apicinterrupt ERROR_APIC_VECTOR \
apicinterrupt SPURIOUS_APIC_VECTOR \
spurious_interrupt smp_spurious_interrupt
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PENDING_VECTOR \
+ perf_pending_interrupt smp_perf_pending_interrupt
+#endif
+
/*
* Exception entry points.
*/
@@ -1379,6 +1371,11 @@ END(xen_failsafe_callback)
paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
paranoiderrorentry stack_segment do_stack_segment
+#ifdef CONFIG_XEN
+zeroentry xen_debug do_debug
+zeroentry xen_int3 do_int3
+errorentry xen_stack_segment do_stack_segment
+#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_X86_MCE
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0c..dc5ed4bdd88 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
ENTRY(initial_code)
.long i386_start_kernel
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
/*
* BSS section
*/
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c..38287b5f116 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
#include <asm/io_apic.h>
#include <asm/irq.h>
#include <asm/idle.h>
+#include <asm/hw_irq.h>
atomic_t irq_err_count;
@@ -24,9 +25,9 @@ void (*generic_interrupt_extension)(void) = NULL;
*/
void ack_bad_irq(unsigned int irq)
{
- printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+ if (printk_ratelimit())
+ pr_err("unexpected IRQ trap at vector %02x\n", irq);
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* Currently unexpected vectors happen only on SMP and APIC.
* We _must_ ack these because every local APIC has only N
@@ -36,9 +37,7 @@ void ack_bad_irq(unsigned int irq)
* completely.
* But only ack when the APIC is enabled -AK
*/
- if (cpu_has_apic)
- ack_APIC_irq();
-#endif
+ ack_APIC_irq();
}
#define irq_stats(x) (&per_cpu(irq_stat, x))
@@ -63,6 +62,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
+ seq_printf(p, "%*s: ", prec, "CNT");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ seq_printf(p, " Performance counter interrupts\n");
+ seq_printf(p, "%*s: ", prec, "PND");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+ seq_printf(p, " Performance pending work\n");
#endif
if (generic_interrupt_extension) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -166,6 +173,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_X86_LOCAL_APIC
sum += irq_stats(cpu)->apic_timer_irqs;
sum += irq_stats(cpu)->irq_spurious_count;
+ sum += irq_stats(cpu)->apic_perf_irqs;
+ sum += irq_stats(cpu)->apic_pending_irqs;
#endif
if (generic_interrupt_extension)
sum += irq_stats(cpu)->generic_irqs;
@@ -178,7 +187,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_thermal_count;
# ifdef CONFIG_X86_64
sum += irq_stats(cpu)->irq_threshold_count;
-#endif
+# endif
#endif
return sum;
}
@@ -213,14 +222,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
irq = __get_cpu_var(vector_irq)[vector];
if (!handle_irq(irq, regs)) {
-#ifdef CONFIG_X86_64
- if (!disable_apic)
- ack_APIC_irq();
-#endif
+ ack_APIC_irq();
if (printk_ratelimit())
- printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+ pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(), vector, irq);
}
irq_exit();
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
index 368b0a8836f..267c6624c77 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit.c
@@ -1,20 +1,25 @@
+#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/timex.h>
#include <linux/slab.h>
#include <linux/random.h>
+#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/timer.h>
+#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
@@ -22,7 +27,23 @@
#include <asm/i8259.h>
#include <asm/traps.h>
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+#ifdef CONFIG_X86_32
/*
* Note that on a 486, we don't want to do a SIGFPE on an irq13
* as the irq is unreliable, and exception 16 works correctly
@@ -52,30 +73,7 @@ static struct irqaction fpu_irq = {
.handler = math_error_irq,
.name = "fpu",
};
-
-void __init init_ISA_irqs(void)
-{
- int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- init_bsp_APIC();
#endif
- init_8259A(0);
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
/*
* IRQ2 is cascade interrupt to second interrupt controller
@@ -118,29 +116,37 @@ int vector_used_by_percpu_irq(unsigned int vector)
return 0;
}
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init init_ISA_irqs(void)
{
int i;
- /* Execute any quirks before the call gates are initialised: */
- x86_quirk_pre_intr_init();
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
/*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
+ * 16 old-style INTA-cycle interrupts:
*/
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
- /* SYSCALL_VECTOR was reserved in trap_init. */
- if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ struct irq_desc *desc = irq_to_desc(i);
+
+ desc->status = IRQ_DISABLED;
+ desc->action = NULL;
+ desc->depth = 1;
+
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
}
+}
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+static void __init smp_intr_init(void)
+{
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
@@ -160,16 +166,32 @@ void __init native_init_IRQ(void)
/* IPI for generic function call */
alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
- /* IPI for single call function */
+ /* IPI for generic single function call */
alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
+ call_function_single_interrupt);
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
#endif
+#endif /* CONFIG_SMP */
+}
+
+static void __init apic_intr_init(void)
+{
+ smp_intr_init();
+
+#ifdef CONFIG_X86_THERMAL_VECTOR
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#ifdef CONFIG_X86_THRESHOLD
+ alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+ alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
+#endif
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
@@ -179,16 +201,59 @@ void __init native_init_IRQ(void)
/* IPI vectors for APIC spurious and error interrupts */
alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ /* Performance monitoring interrupts: */
+# ifdef CONFIG_PERF_COUNTERS
+ alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# endif
+
#endif
+}
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
- /* thermal monitor LVT interrupt */
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ * Perform any necessary interrupt initialisation prior to setting up
+ * the "ordinary" interrupt call gates. For legacy reasons, the ISA
+ * interrupts should be initialised here if the machine emulates a PC
+ * in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+#ifdef CONFIG_X86_32
+ if (x86_quirks->arch_pre_intr_init) {
+ if (x86_quirks->arch_pre_intr_init())
+ return;
+ }
#endif
+ init_ISA_irqs();
+}
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ /* Execute any quirks before the call gates are initialised: */
+ x86_quirk_pre_intr_init();
+
+ apic_intr_init();
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
+ if (!test_bit(i, used_vectors))
+ set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+ }
if (!acpi_ioapic)
setup_irq(2, &irq2);
+#ifdef CONFIG_X86_32
/*
* Call quirks after call gates are initialised (usually add in
* the architecture specific gates):
@@ -203,4 +268,5 @@ void __init native_init_IRQ(void)
setup_irq(FPU_IRQ, &fpu_irq);
irq_ctx_init(smp_processor_id());
+#endif
}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index 8cd10537fd4..00000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
- .handler = no_action,
- .name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
- int cpu;
-
- for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
- return 1;
- }
-
- return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
- int i;
-
- init_bsp_APIC();
- init_8259A(0);
-
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- struct irq_desc *desc = irq_to_desc(i);
-
- desc->status = IRQ_DISABLED;
- desc->action = NULL;
- desc->depth = 1;
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPIs for invalidation */
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
- alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
- /* IPI for generic function call */
- alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* IPI for generic single function call */
- alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
- call_function_single_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
- set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-}
-
-static void __init apic_intr_init(void)
-{
- smp_intr_init();
-
- alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
- alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
- /* self generated IPI for local APIC timer */
- alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* generic IPI for platform specific use */
- alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-}
-
-void __init native_init_IRQ(void)
-{
- int i;
-
- init_ISA_irqs();
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != IA32_SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
- }
-
- apic_intr_init();
-
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b1f4dffb919..8d82a77a3f3 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = p->thread.ip;
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b..6551dedee20 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -195,7 +195,7 @@ static void kvm_leave_lazy_mmu(void)
struct kvm_para_state *state = kvm_para_state();
mmu_queue_flush(state);
- paravirt_leave_lazy(paravirt_get_lazy_mode());
+ paravirt_leave_lazy_mmu();
state->mode = paravirt_get_lazy_mode();
}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c..366baa17991 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
*/
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
#include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
#include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
#include <asm/microcode.h>
#include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
#define UCODE_CONTAINER_SECTION_HDR 8
#define UCODE_CONTAINER_HEADER_SIZE 12
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
static struct equiv_cpu_entry *equiv_cpu_table;
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
return 1;
}
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
{
- unsigned long flags;
u32 rev, dummy;
int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
BUG_ON(cpu_num != cpu);
if (mc_amd == NULL)
- return;
+ return 0;
- spin_lock_irqsave(&microcode_update_lock, flags);
wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* get patch id after patching */
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
/* check current patch id and patch's id for match */
if (rev != mc_amd->hdr.patch_id) {
printk(KERN_ERR "microcode: CPU%d: update failed "
"(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
- return;
+ return -1;
}
printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
cpu, rev);
uci->cpu_sig.rev = rev;
+
+ return 0;
}
static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
static void free_equiv_cpu_table(void)
{
- if (equiv_cpu_table) {
- vfree(equiv_cpu_table);
- equiv_cpu_table = NULL;
- }
+ vfree(equiv_cpu_table);
+ equiv_cpu_table = NULL;
}
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
int new_rev = uci->cpu_sig.rev;
unsigned int leftover;
unsigned long offset;
+ enum ucode_state state = UCODE_OK;
offset = install_equiv_cpu_table(ucode_ptr);
if (!offset) {
printk(KERN_ERR "microcode: failed to create "
"equivalent cpu table\n");
- return -EINVAL;
+ return UCODE_ERROR;
}
ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
mc_header = (struct microcode_header_amd *)mc;
if (get_matching_microcode(cpu, mc, new_rev)) {
- if (new_mc)
- vfree(new_mc);
+ vfree(new_mc);
new_rev = mc_header->patch_id;
new_mc = mc;
} else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
if (new_mc) {
if (!leftover) {
- if (uci->mc)
- vfree(uci->mc);
+ vfree(uci->mc);
uci->mc = new_mc;
pr_debug("microcode: CPU%d found a matching microcode "
"update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
- } else
+ } else {
vfree(new_mc);
- }
+ state = UCODE_ERROR;
+ }
+ } else
+ state = UCODE_NFOUND;
free_equiv_cpu_table();
- return (int)leftover;
+ return state;
}
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
const char *fw_name = "amd-ucode/microcode_amd.bin";
const struct firmware *firmware;
- int ret;
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
+ enum ucode_state ret;
- ret = request_firmware(&firmware, fw_name, device);
- if (ret) {
+ if (request_firmware(&firmware, fw_name, device)) {
printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
- return ret;
+ return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
return ret;
}
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
{
printk(KERN_INFO "microcode: AMD microcode update via "
"/dev/cpu/microcode not supported\n");
- return -1;
+ return UCODE_ERROR;
}
static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d..9c4461501fc 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
* Thanks to Stuart Swales for pointing out this bug.
*/
#include <linux/platform_device.h>
-#include <linux/capability.h>
#include <linux/miscdevice.h>
-#include <linux/firmware.h>
+#include <linux/capability.h>
#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <asm/microcode.h>
#include <asm/processor.h>
-#include <asm/msr.h>
MODULE_DESCRIPTION("Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
static struct microcode_ops *microcode_ops;
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ * the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
static DEFINE_MUTEX(microcode_mutex);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
EXPORT_SYMBOL_GPL(ucode_cpu_info);
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+ struct cpu_signature *cpu_sig;
+ int err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+ struct cpu_info_ctx *ctx = arg;
+
+ ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+ ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+ struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+ int ret;
+
+ ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+ if (!ret)
+ ret = ctx.err;
+
+ return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ int ret;
+
+ memset(uci, 0, sizeof(*uci));
+
+ ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+ if (!ret)
+ uci->valid = 1;
+
+ return ret;
+}
+
+struct apply_microcode_ctx {
+ int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+ struct apply_microcode_ctx *ctx = arg;
+
+ ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+ struct apply_microcode_ctx ctx = { .err = 0 };
+ int ret;
+
+ ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+ if (!ret)
+ ret = ctx.err;
+
+ return ret;
+}
+
#ifdef CONFIG_MICROCODE_OLD_INTERFACE
static int do_microcode_update(const void __user *buf, size_t size)
{
- cpumask_t old;
int error = 0;
int cpu;
- old = current->cpus_allowed;
-
for_each_online_cpu(cpu) {
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
if (!uci->valid)
continue;
- set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- error = microcode_ops->request_microcode_user(cpu, buf, size);
- if (error < 0)
- goto out;
- if (!error)
- microcode_ops->apply_microcode(cpu);
+ ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+ if (ustate == UCODE_ERROR) {
+ error = -1;
+ break;
+ } else if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
}
-out:
- set_cpus_allowed_ptr(current, &old);
+
return error;
}
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
static ssize_t microcode_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
- ssize_t ret;
+ ssize_t ret = -EINVAL;
if ((len >> PAGE_SHIFT) > num_physpages) {
- printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
- num_physpages);
- return -EINVAL;
+ pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+ return ret;
}
get_online_cpus();
mutex_lock(&microcode_mutex);
- ret = do_microcode_update(buf, len);
- if (!ret)
+ if (do_microcode_update(buf, len) == 0)
ret = (ssize_t)len;
mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
}
static const struct file_operations microcode_fops = {
- .owner = THIS_MODULE,
- .write = microcode_write,
- .open = microcode_open,
+ .owner = THIS_MODULE,
+ .write = microcode_write,
+ .open = microcode_open,
};
static struct miscdevice microcode_dev = {
- .minor = MICROCODE_MINOR,
- .name = "microcode",
- .fops = &microcode_fops,
+ .minor = MICROCODE_MINOR,
+ .name = "microcode",
+ .fops = &microcode_fops,
};
static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
error = misc_register(&microcode_dev);
if (error) {
- printk(KERN_ERR
- "microcode: can't misc_register on minor=%d\n",
- MICROCODE_MINOR);
+ pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
return error;
}
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
/* fake device for request_firmware */
static struct platform_device *microcode_pdev;
-static long reload_for_cpu(void *unused)
+static int reload_for_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
int err = 0;
mutex_lock(&microcode_mutex);
if (uci->valid) {
- err = microcode_ops->request_microcode_fw(smp_processor_id(),
- &microcode_pdev->dev);
- if (!err)
- microcode_ops->apply_microcode(smp_processor_id());
+ enum ucode_state ustate;
+
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+ if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
+ else
+ if (ustate == UCODE_ERROR)
+ err = -EINVAL;
}
mutex_unlock(&microcode_mutex);
+
return err;
}
static ssize_t reload_store(struct sys_device *dev,
struct sysdev_attribute *attr,
- const char *buf, size_t sz)
+ const char *buf, size_t size)
{
- char *end;
- unsigned long val = simple_strtoul(buf, &end, 0);
- int err = 0;
+ unsigned long val;
int cpu = dev->id;
+ int ret = 0;
+ char *end;
+ val = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
+
if (val == 1) {
get_online_cpus();
if (cpu_online(cpu))
- err = work_on_cpu(cpu, reload_for_cpu, NULL);
+ ret = reload_for_cpu(cpu);
put_online_cpus();
}
- if (err)
- return err;
- return sz;
+
+ if (!ret)
+ ret = size;
+
+ return ret;
}
static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
};
static struct attribute_group mc_attr_group = {
- .attrs = mc_default_attrs,
- .name = "microcode",
+ .attrs = mc_default_attrs,
+ .name = "microcode",
};
-static void __microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
uci->valid = 0;
}
-static void microcode_fini_cpu(int cpu)
-{
- mutex_lock(&microcode_mutex);
- __microcode_fini_cpu(cpu);
- mutex_unlock(&microcode_mutex);
-}
-
-static void collect_cpu_info(int cpu)
+static enum ucode_state microcode_resume_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- memset(uci, 0, sizeof(*uci));
- if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
- uci->valid = 1;
+ if (!uci->mc)
+ return UCODE_NFOUND;
+
+ pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+ apply_microcode_on_target(cpu);
+
+ return UCODE_OK;
}
-static int microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct cpu_signature nsig;
+ enum ucode_state ustate;
- pr_debug("microcode: CPU%d resumed\n", cpu);
+ if (collect_cpu_info(cpu))
+ return UCODE_ERROR;
- if (!uci->mc)
- return 1;
+ /* --dimm. Trigger a delayed update? */
+ if (system_state != SYSTEM_RUNNING)
+ return UCODE_NFOUND;
- /*
- * Let's verify that the 'cached' ucode does belong
- * to this cpu (a bit of paranoia):
- */
- if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
- __microcode_fini_cpu(cpu);
- printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
- cpu);
- return -1;
- }
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
- __microcode_fini_cpu(cpu);
- printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
- cpu);
- /* Should we look for a new ucode here? */
- return 1;
+ if (ustate == UCODE_OK) {
+ pr_debug("microcode: CPU%d updated upon init\n", cpu);
+ apply_microcode_on_target(cpu);
}
- return 0;
+ return ustate;
}
-static long microcode_update_cpu(void *unused)
+static enum ucode_state microcode_update_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
- int err = 0;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
- /*
- * Check if the system resume is in progress (uci->valid != NULL),
- * otherwise just request a firmware:
- */
- if (uci->valid) {
- err = microcode_resume_cpu(smp_processor_id());
- } else {
- collect_cpu_info(smp_processor_id());
- if (uci->valid && system_state == SYSTEM_RUNNING)
- err = microcode_ops->request_microcode_fw(
- smp_processor_id(),
- &microcode_pdev->dev);
- }
- if (!err)
- microcode_ops->apply_microcode(smp_processor_id());
- return err;
-}
+ if (uci->valid)
+ ustate = microcode_resume_cpu(cpu);
+ else
+ ustate = microcode_init_cpu(cpu);
-static int microcode_init_cpu(int cpu)
-{
- int err;
- mutex_lock(&microcode_mutex);
- err = work_on_cpu(cpu, microcode_update_cpu, NULL);
- mutex_unlock(&microcode_mutex);
-
- return err;
+ return ustate;
}
static int mc_sysdev_add(struct sys_device *sys_dev)
{
int err, cpu = sys_dev->id;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (!cpu_online(cpu))
return 0;
pr_debug("microcode: CPU%d added\n", cpu);
- memset(uci, 0, sizeof(*uci));
err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
if (err)
return err;
- err = microcode_init_cpu(cpu);
+ if (microcode_init_cpu(cpu) == UCODE_ERROR)
+ err = -EINVAL;
return err;
}
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
static int mc_sysdev_resume(struct sys_device *dev)
{
int cpu = dev->id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
if (!cpu_online(cpu))
return 0;
- /* only CPU 0 will apply ucode here */
- microcode_update_cpu(NULL);
+ /*
+ * All non-bootup cpus are still disabled,
+ * so only CPU 0 will apply ucode here.
+ *
+ * Moreover, there can be no concurrent
+ * updates from any other places at this point.
+ */
+ WARN_ON(cpu != 0);
+
+ if (uci->valid && uci->mc)
+ microcode_ops->apply_microcode(cpu);
+
return 0;
}
static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
- .resume = mc_sysdev_resume,
+ .add = mc_sysdev_add,
+ .remove = mc_sysdev_remove,
+ .resume = mc_sysdev_resume,
};
static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- if (microcode_init_cpu(cpu))
- printk(KERN_ERR "microcode: failed to init CPU%d\n",
- cpu);
+ microcode_update_cpu(cpu);
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
pr_debug("microcode: CPU%d added\n", cpu);
if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- printk(KERN_ERR "microcode: Failed to create the sysfs "
- "group for CPU%d\n", cpu);
+ pr_err("microcode: Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
microcode_ops = init_amd_microcode();
if (!microcode_ops) {
- printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+ pr_err("microcode: no support for this CPU vendor\n");
return -ENODEV;
}
- error = microcode_dev_init();
- if (error)
- return error;
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
}
get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+
+ mutex_unlock(&microcode_mutex);
put_online_cpus();
+
if (error) {
- microcode_dev_exit();
platform_device_unregister(microcode_pdev);
return error;
}
+ error = microcode_dev_init();
+ if (error)
+ return error;
+
register_hotcpu_notifier(&mc_cpu_notifier);
- printk(KERN_INFO
- "Microcode Update Driver: v" MICROCODE_VERSION
+ pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>,"
" Peter Oruba\n");
return 0;
}
+module_init(microcode_init);
static void __exit microcode_exit(void)
{
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
unregister_hotcpu_notifier(&mc_cpu_notifier);
get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+
+ mutex_unlock(&microcode_mutex);
put_online_cpus();
platform_device_unregister(microcode_pdev);
microcode_ops = NULL;
- printk(KERN_INFO
- "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+ pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
}
-
-module_init(microcode_init);
module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1a..0d334ddd0a9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
* Fix sigmatch() macro to handle old CPUs with pf == 0.
* Thanks to Stuart Swales for pointing out this bug.
*/
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
#include <linux/firmware.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <asm/microcode.h>
#include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu_num);
- unsigned long flags;
unsigned int val[2];
memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
csig->pf = 1 << ((val[1] >> 18) & 7);
}
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
-
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
/* see notes above for revision 1.07. Apparent chip bug */
sync_core();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
- pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
- csig->sig, csig->pf, csig->rev);
+ printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+ cpu_num, csig->sig, csig->pf, csig->rev);
return 0;
}
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
return 0;
}
-static void apply_microcode(int cpu)
+static int apply_microcode(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
- unsigned long flags;
unsigned int val[2];
int cpu_num;
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
BUG_ON(cpu_num != cpu);
if (mc_intel == NULL)
- return;
-
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
+ return 0;
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- spin_unlock_irqrestore(&microcode_update_lock, flags);
if (val[1] != mc_intel->hdr.rev) {
- printk(KERN_ERR "microcode: CPU%d update from revision "
- "0x%x to 0x%x failed\n",
- cpu_num, uci->cpu_sig.rev, val[1]);
- return;
+ printk(KERN_ERR "microcode: CPU%d update "
+ "to revision 0x%x failed\n",
+ cpu_num, mc_intel->hdr.rev);
+ return -1;
}
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %04x-%02x-%02x \n",
- cpu_num, uci->cpu_sig.rev, val[1],
+ printk(KERN_INFO "microcode: CPU%d updated to revision "
+ "0x%x, date = %04x-%02x-%02x \n",
+ cpu_num, val[1],
mc_intel->hdr.date & 0xffff,
mc_intel->hdr.date >> 24,
(mc_intel->hdr.date >> 16) & 0xff);
uci->cpu_sig.rev = val[1];
+
+ return 0;
}
-static int generic_load_microcode(int cpu, void *data, size_t size,
- int (*get_ucode_data)(void *, const void *, size_t))
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+ int (*get_ucode_data)(void *, const void *, size_t))
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
u8 *ucode_ptr = data, *new_mc = NULL, *mc;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
+ enum ucode_state state = UCODE_OK;
while (leftover) {
struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
leftover -= mc_size;
}
- if (!new_mc)
+ if (leftover) {
+ if (new_mc)
+ vfree(new_mc);
+ state = UCODE_ERROR;
goto out;
+ }
- if (leftover) {
- vfree(new_mc);
+ if (!new_mc) {
+ state = UCODE_NFOUND;
goto out;
}
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
pr_debug("microcode: CPU%d found a matching microcode update with"
" version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
-
- out:
- return (int)leftover;
+out:
+ return state;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
return 0;
}
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
const struct firmware *firmware;
- int ret;
+ enum ucode_state ret;
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
- ret = request_firmware(&firmware, name, device);
- if (ret) {
+
+ if (request_firmware(&firmware, name, device)) {
pr_debug("microcode: data file %s load failed\n", name);
- return ret;
+ return UCODE_NFOUND;
}
ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
return copy_from_user(to, from, n);
}
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
{
- /* We should bind the task to the CPU */
- BUG_ON(cpu != raw_smp_processor_id());
-
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c1..651c93b2886 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/smp.h>
+#include <linux/pci.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
#endif /* CONFIG_X86_IO_APIC */
-static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
- int count)
+static int
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
{
- if (!mpc_new_phys) {
- pr_info("No spare slots, try to append...take your risk, "
- "new mpc_length %x\n", count);
- } else {
- if (count <= mpc_new_length)
- pr_info("No spare slots, try to append..., "
- "new mpc_length %x\n", count);
- else {
- pr_err("mpc_new_length %lx is too small\n",
- mpc_new_length);
- return -1;
- }
+ int ret = 0;
+
+ if (!mpc_new_phys || count <= mpc_new_length) {
+ WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+ return -1;
}
- return 0;
+ return ret;
}
static int __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
- if (!check_slot(mpc_new_phys, mpc_new_length, count))
+ if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
goto out;
assign_to_mpc_intsrc(&mp_irqs[i], m);
mpc->length = count;
@@ -963,11 +957,14 @@ out:
return 0;
}
-static int __initdata enable_update_mptable;
+int enable_update_mptable;
static int __init update_mptable_setup(char *str)
{
enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
return 0;
}
early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
static int __init parse_alloc_mptable_opt(char *p)
{
enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+ pci_routeirq = 1;
+#endif
alloc_mptable = 1;
if (!p)
return 0;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 9faf43bea33..70ec9b951d7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
static inline void enter_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
- __get_cpu_var(paravirt_lazy_mode) = mode;
+ percpu_write(paravirt_lazy_mode, mode);
}
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
{
- BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
- BUG_ON(preemptible());
+ BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
- __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+ percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
}
void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
void paravirt_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+ leave_lazy(PARAVIRT_LAZY_MMU);
}
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
{
+ BUG_ON(preemptible());
+
+ if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
enter_lazy(PARAVIRT_LAZY_CPU);
}
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+ BUG_ON(preemptible());
+
+ leave_lazy(PARAVIRT_LAZY_CPU);
+
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
}
enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
{
- return __get_cpu_var(paravirt_lazy_mode);
+ if (in_interrupt())
+ return PARAVIRT_LAZY_NONE;
+
+ return percpu_read(paravirt_lazy_mode);
}
void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
preempt_disable();
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
- WARN_ON(preempt_count() == 1);
arch_leave_lazy_mmu_mode();
arch_enter_lazy_mmu_mode();
}
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
preempt_enable();
}
-void arch_flush_lazy_cpu_mode(void)
-{
- preempt_disable();
-
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
- WARN_ON(preempt_count() == 1);
- arch_leave_lazy_cpu_mode();
- arch_enter_lazy_cpu_mode();
- }
-
- preempt_enable();
-}
-
struct pv_info pv_info = {
.name = "bare hardware",
.paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
.set_iopl_mask = native_set_iopl_mask,
.io_delay = native_io_delay,
- .lazy_mode = {
- .enter = paravirt_nop,
- .leave = paravirt_nop,
- },
+ .start_context_switch = paravirt_nop,
+ .end_context_switch = paravirt_nop,
};
struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f..971a3bec47a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
- int expected, unsigned long start, unsigned long end)
-{
- unsigned long idx = start;
-
- BUG_ON(start >= end);
-
- while (idx < end) {
- if (!!test_bit(idx, bitmap) != expected)
- return idx;
- ++idx;
- }
-
- /* all bits have the expected value */
- return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
- int expected, unsigned long start, unsigned long end)
-{
- return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
static inline int translation_enabled(struct iommu_table *tbl)
{
/* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
{
unsigned long index;
unsigned long end;
- unsigned long badbit;
unsigned long flags;
index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
spin_lock_irqsave(&tbl->it_lock, flags);
- badbit = verify_bit_range(tbl->it_map, 0, index, end);
- if (badbit != ~0UL) {
- if (printk_ratelimit())
- printk(KERN_ERR "Calgary: entry already allocated at "
- "0x%lx tbl %p dma 0x%lx npages %u\n",
- badbit, tbl, start_addr, npages);
- }
-
iommu_area_reserve(tbl->it_map, index, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned int npages)
{
unsigned long entry;
- unsigned long badbit;
unsigned long badend;
unsigned long flags;
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
spin_lock_irqsave(&tbl->it_lock, flags);
- badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
- if (badbit != ~0UL) {
- if (printk_ratelimit())
- printk(KERN_ERR "Calgary: bit is off at 0x%lx "
- "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
- badbit, tbl, dma_addr, entry, npages);
- }
-
iommu_area_free(tbl->it_map, entry, npages);
spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
iommu_detected = 1;
calgary_detected = 1;
printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
- printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
- "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
- debugging ? "enabled" : "disabled");
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+ specified_table_size);
/* swiotlb for devices that aren't behind the Calgary. */
if (max_pfn > MAX_DMA32_PFN)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035..cfd9f906389 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
}
#ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x) \
- do { \
- if (iommu_leak_tab) \
- iommu_leak_tab[x] = __builtin_return_address(0);\
- } while (0)
-
-#define CLEAR_LEAK(x) \
- do { \
- if (iommu_leak_tab) \
- iommu_leak_tab[x] = NULL; \
- } while (0)
-
/* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
static int leak_trace;
static int iommu_leak_pages = 20;
static void dump_leak(void)
{
- int i;
static int dump;
- if (dump || !iommu_leak_tab)
+ if (dump)
return;
dump = 1;
- show_stack(NULL, NULL);
- /* Very crude. dump some from the end of the table too */
- printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
- iommu_leak_pages);
- for (i = 0; i < iommu_leak_pages; i += 2) {
- printk(KERN_DEBUG "%lu: ", iommu_pages-i);
- printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
- 0);
- printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
- }
- printk(KERN_DEBUG "\n");
+ show_stack(NULL, NULL);
+ debug_dma_dump_mappings(NULL);
}
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
#endif
static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
- SET_LEAK(iommu_page + i);
phys_mem += PAGE_SIZE;
}
return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
for (i = 0; i < npages; i++) {
iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
- CLEAR_LEAK(iommu_page + i);
}
free_iommu(iommu_page, npages);
}
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
while (pages--) {
iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
- SET_LEAK(iommu_page);
addr += PAGE_SIZE;
iommu_page++;
}
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
agp_gatt_table = gatt;
- enable_gart_translations();
-
error = sysdev_class_register(&gart_sysdev_class);
if (!error)
error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
#ifdef CONFIG_IOMMU_LEAK
if (leak_trace) {
- iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
- get_order(iommu_pages*sizeof(void *)));
- if (!iommu_leak_tab)
+ int ret;
+
+ ret = dma_debug_resize_entries(iommu_pages);
+ if (ret)
printk(KERN_DEBUG
- "PCI-DMA: Cannot allocate leak trace area\n");
+ "PCI-DMA: Cannot trace all the entries\n");
}
#endif
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
* the pages as Not-Present:
*/
wbinvd();
+
+ /*
+ * Now all caches are flushed and we can safely enable
+ * GART hardware. Doing it early leaves the possibility
+ * of stale cache entries that can lead to GART PTE
+ * errors.
+ */
+ enable_gart_translations();
/*
* Try to workaround a bug (thanks to BenH):
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 221a3853e26..a1712f2b50f 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
return paddr;
}
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
{
return baddr;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e84..3bb2be1649b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,12 +8,15 @@
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/clockchips.h>
+#include <linux/random.h>
#include <trace/power.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
+#include <asm/ds.h>
unsigned long idle_halt;
EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
tsk->thread.xstate = NULL;
}
+
+ WARN(tsk->thread.ds_ctx, "leaking DS context\n");
}
void free_thread_info(struct thread_info *ti)
@@ -83,8 +88,6 @@ void exit_thread(void)
put_cpu();
kfree(bp);
}
-
- ds_exit_thread(current);
}
void flush_thread(void)
@@ -613,3 +616,16 @@ static int __init idle_setup(char *str)
}
early_param("idle", idle_setup);
+unsigned long arch_align_stack(unsigned long sp)
+{
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ sp -= get_random_int() % 8192;
+ return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ unsigned long range_end = mm->brk + 0x02000000;
+ return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a..59f4524984a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <stdarg.h>
-
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
@@ -33,7 +31,6 @@
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
-#include <linux/random.h>
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.io_bitmap_max = 0;
}
- ds_copy_thread(p, current);
+ clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+ p->thread.ds_ctx = NULL;
clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
-unsigned long arch_align_stack(unsigned long sp)
-{
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
- return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b..ebefb5407b9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
* This file handles the architecture-dependent parts of process handling..
*/
-#include <stdarg.h>
-
#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
@@ -32,7 +30,6 @@
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
-#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
goto out;
}
- ds_copy_thread(p, me);
+ clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+ p->thread.ds_ctx = NULL;
clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
* done before math_state_restore, so the TS bit is up
* to date.
*/
- arch_leave_lazy_cpu_mode();
+ arch_end_context_switch(next_p);
/*
* Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
return do_arch_prctl(current, code, addr);
}
-unsigned long arch_align_stack(unsigned long sp)
-{
- if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
- sp -= get_random_int() % 8192;
- return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 23b7c8f017e..09ecbde91c1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
+#include <linux/workqueue.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
}
#ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+ /* The branch trace handle. */
+ struct bts_tracer *tracer;
+
+ /* The buffer used to store the branch trace and its size. */
+ void *buffer;
+ unsigned int size;
+
+ /* The mm that paid for the above buffer. */
+ struct mm_struct *mm;
+
+ /* The task this context belongs to. */
+ struct task_struct *task;
+
+ /* The signal to send on a bts buffer overflow. */
+ unsigned int bts_ovfl_signal;
+
+ /* The work struct to destroy a context. */
+ struct work_struct work;
+};
+
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+{
+ void *buffer = NULL;
+ int err = -ENOMEM;
+
+ err = account_locked_memory(current->mm, current->signal->rlim, size);
+ if (err < 0)
+ return err;
+
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
+ goto out_refund;
+
+ context->buffer = buffer;
+ context->size = size;
+ context->mm = get_task_mm(current);
+
+ return 0;
+
+ out_refund:
+ refund_locked_memory(current->mm, size);
+ return err;
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+ if (!context->buffer)
+ return;
+
+ kfree(context->buffer);
+ context->buffer = NULL;
+
+ refund_locked_memory(context->mm, context->size);
+ context->size = 0;
+
+ mmput(context->mm);
+ context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+ struct bts_context *context;
+
+ context = container_of(w, struct bts_context, work);
+
+ ds_release_bts(context->tracer);
+ put_task_struct(context->task);
+ free_bts_buffer(context);
+ kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+ INIT_WORK(&context->work, free_bts_context_work);
+ schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+ struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (context) {
+ context->task = task;
+ task->bts = context;
+
+ get_task_struct(task);
+ }
+
+ return context;
+}
+
static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
+ struct bts_context *context;
const struct bts_trace *trace;
struct bts_struct bts;
const unsigned char *at;
int error;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
at = trace->ds.top - ((index + 1) * trace->ds.size);
if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
if (!trace->read)
return -EOPNOTSUPP;
- error = trace->read(child->bts, at, &bts);
+ error = trace->read(context->tracer, at, &bts);
if (error < 0)
return error;
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
+ struct bts_context *context;
const struct bts_trace *trace;
const unsigned char *at;
int error, drained = 0;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
if (!trace->read)
return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
for (at = trace->ds.begin; (void *)at < trace->ds.top;
out++, drained++, at += trace->ds.size) {
struct bts_struct bts;
- int error;
- error = trace->read(child->bts, at, &bts);
+ error = trace->read(context->tracer, at, &bts);
if (error < 0)
return error;
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- error = ds_reset_bts(child->bts);
+ error = ds_reset_bts(context->tracer);
if (error < 0)
return error;
return drained;
}
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
- child->bts_buffer = alloc_locked_buffer(size);
- if (!child->bts_buffer)
- return -ENOMEM;
-
- child->bts_size = size;
-
- return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
- free_locked_buffer(child->bts_buffer, child->bts_size);
- child->bts_buffer = NULL;
- child->bts_size = 0;
-}
-
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
{
+ struct bts_context *context;
struct ptrace_bts_config cfg;
unsigned int flags = 0;
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
return -EFAULT;
- if (child->bts) {
- ds_release_bts(child->bts);
- child->bts = NULL;
- }
+ context = child->bts;
+ if (!context)
+ context = alloc_bts_context(child);
+ if (!context)
+ return -ENOMEM;
if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
if (!cfg.signal)
return -EINVAL;
- child->thread.bts_ovfl_signal = cfg.signal;
return -EOPNOTSUPP;
+ context->bts_ovfl_signal = cfg.signal;
}
- if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
- (cfg.size != child->bts_size)) {
- int error;
+ ds_release_bts(context->tracer);
+ context->tracer = NULL;
- ptrace_bts_free_buffer(child);
+ if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+ int err;
- error = ptrace_bts_allocate_buffer(child, cfg.size);
- if (error < 0)
- return error;
+ free_bts_buffer(context);
+ if (!cfg.size)
+ return 0;
+
+ err = alloc_bts_buffer(context, cfg.size);
+ if (err < 0)
+ return err;
}
if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
if (cfg.flags & PTRACE_BTS_O_SCHED)
flags |= BTS_TIMESTAMPS;
- child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
- /* ovfl = */ NULL, /* th = */ (size_t)-1,
- flags);
- if (IS_ERR(child->bts)) {
- int error = PTR_ERR(child->bts);
-
- ptrace_bts_free_buffer(child);
- child->bts = NULL;
+ context->tracer =
+ ds_request_bts_task(child, context->buffer, context->size,
+ NULL, (size_t)-1, flags);
+ if (unlikely(IS_ERR(context->tracer))) {
+ int error = PTR_ERR(context->tracer);
+ free_bts_buffer(context);
+ context->tracer = NULL;
return error;
}
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
+ struct bts_context *context;
const struct bts_trace *trace;
struct ptrace_bts_config cfg;
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
if (cfg_size < sizeof(cfg))
return -EIO;
- trace = ds_read_bts(child->bts);
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
memset(&cfg, 0, sizeof(cfg));
- cfg.size = trace->ds.end - trace->ds.begin;
- cfg.signal = child->thread.bts_ovfl_signal;
- cfg.bts_size = sizeof(struct bts_struct);
+ cfg.size = trace->ds.end - trace->ds.begin;
+ cfg.signal = context->bts_ovfl_signal;
+ cfg.bts_size = sizeof(struct bts_struct);
if (cfg.signal)
cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
static int ptrace_bts_clear(struct task_struct *child)
{
+ struct bts_context *context;
const struct bts_trace *trace;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
- return ds_reset_bts(child->bts);
+ return ds_reset_bts(context->tracer);
}
static int ptrace_bts_size(struct task_struct *child)
{
+ struct bts_context *context;
const struct bts_trace *trace;
- trace = ds_read_bts(child->bts);
+ context = child->bts;
+ if (!context)
+ return -ESRCH;
+
+ trace = ds_read_bts(context->tracer);
if (!trace)
- return -EPERM;
+ return -ESRCH;
return (trace->ds.top - trace->ds.begin) / trace->ds.size;
}
-static void ptrace_bts_fork(struct task_struct *tsk)
-{
- tsk->bts = NULL;
- tsk->bts_buffer = NULL;
- tsk->bts_size = 0;
- tsk->thread.bts_ovfl_signal = 0;
-}
-
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+void ptrace_bts_untrace(struct task_struct *child)
{
if (unlikely(child->bts)) {
- ds_release_bts(child->bts);
+ free_bts_context(child->bts);
child->bts = NULL;
-
- /* We cannot update total_vm and locked_vm since
- child's mm is already gone. But we can reclaim the
- memory. */
- kfree(child->bts_buffer);
- child->bts_buffer = NULL;
- child->bts_size = 0;
}
}
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
- /*
- * Ptrace_detach() races with ptrace_untrace() in case
- * the child dies and is reaped by another thread.
- *
- * We only do the memory accounting at this point and
- * leave the buffer deallocation and the bts tracer
- * release to ptrace_bts_untrace() which will be called
- * later on with tasklist_lock held.
- */
- release_locked_buffer(child->bts_buffer, child->bts_size);
-}
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
#endif /* CONFIG_X86_PTRACE_BTS */
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
- ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
- ptrace_bts_untrace(child);
-}
-
/*
* Called by kernel/ptrace.c when detaching..
*
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
- ptrace_bts_detach(child);
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 7563b31b4f0..af71d06624b 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
break;
}
}
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void __init quirk_amd_nb_node(struct pci_dev *dev)
+{
+ struct pci_dev *nb_ht;
+ unsigned int devfn;
+ u32 val;
+
+ devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+ nb_ht = pci_get_slot(dev->bus, devfn);
+ if (!nb_ht)
+ return;
+
+ pci_read_config_dword(nb_ht, 0x60, &val);
+ set_dev_node(&dev->dev, val & 7);
+ pci_dev_put(dev);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+ quirk_amd_nb_node);
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e0b5a..d2d1ce8170f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 360",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ },
+ },
{ /* Handle problems with rebooting on Dell 2400's */
.callback = set_bios_reboot,
.ident = "Dell PowerEdge 2400",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf6..d1c636bf31a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
#define ARCH_SETUP
#endif
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
RESERVE_BRK(dmi_alloc, 65536);
unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
unsigned long mmu_cr4_features = X86_CR4_PAE;
#endif
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
/*
* Setup options
@@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p)
#endif
saved_video_mode = boot_params.hdr.vid_mode;
bootloader_type = boot_params.hdr.type_of_loader;
+ if ((bootloader_type >> 4) == 0xe) {
+ bootloader_type &= 0xf;
+ bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+ }
+ bootloader_version = bootloader_type & 0xf;
+ bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+ max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
#endif
#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
setup_bios_corruption_check();
#endif
+ printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+
reserve_brk();
/* max_pfn_mapped is updated here */
@@ -997,24 +1015,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- * Perform any necessary interrupt initialisation prior to setting up
- * the "ordinary" interrupt call gates. For legacy reasons, the ISA
- * interrupts should be initialised here if the machine emulates a PC
- * in any way.
- **/
-void __init x86_quirk_pre_intr_init(void)
-{
- if (x86_quirks->arch_pre_intr_init) {
- if (x86_quirks->arch_pre_intr_init())
- return;
- }
- init_ISA_irqs();
-}
-
-/**
* x86_quirk_intr_init - post gate setup interrupt initialisation
*
* Description:
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 8f0e13be36b..9c3f0823e6a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+ /*
+ * make sure boot cpu node_number is right, when boot cpu is on the
+ * node that doesn't have mem installed
+ */
+ per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+#endif
+
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e..0a813b17b17 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
*/
-
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8cca..f6db48c405b 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
}
struct smp_ops smp_ops = {
- .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
- .smp_prepare_cpus = native_smp_prepare_cpus,
- .smp_cpus_done = native_smp_cpus_done,
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .smp_cpus_done = native_smp_cpus_done,
- .smp_send_stop = native_smp_send_stop,
- .smp_send_reschedule = native_smp_send_reschedule,
+ .smp_send_stop = native_smp_send_stop,
+ .smp_send_reschedule = native_smp_send_reschedule,
- .cpu_up = native_cpu_up,
- .cpu_die = native_cpu_die,
- .cpu_disable = native_cpu_disable,
- .play_dead = native_play_dead,
+ .cpu_up = native_cpu_up,
+ .cpu_die = native_cpu_die,
+ .cpu_disable = native_cpu_disable,
+ .play_dead = native_play_dead,
- .send_call_func_ipi = native_send_call_func_ipi,
+ .send_call_func_ipi = native_send_call_func_ipi,
.send_call_func_single_ipi = native_send_call_func_single_ipi,
};
EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58d24ef917d..7c80007ea5f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
-int __devinit
+int __cpuinit
wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-int __devinit
+static int __cpuinit
wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status, accept_status = 0;
@@ -822,10 +822,12 @@ do_rest:
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
- /*
- * Cleanup possible dangling ends...
- */
- smpboot_restore_warm_reset_vector();
+ if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+ }
return boot_error;
}
@@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
*/
if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
!cpu_has_apic) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- printk(KERN_ERR "... forcing use of dummy APIC emulation."
+ if (!disable_apic) {
+ pr_err("BIOS bug, local APIC #%d not detected!...\n",
+ boot_cpu_physical_apicid);
+ pr_err("... forcing use of dummy APIC emulation."
"(tell your hw vendor)\n");
+ }
smpboot_clear_io_apic();
arch_disable_smp_support();
return -1;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2e37d..4aaf7e48394 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
static int save_stack_stack(void *data, char *name)
{
- return -1;
+ return 0;
}
static void save_stack_address(void *data, unsigned long addr, int reliable)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b49..d51321ddafd 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
.long sys_inotify_init1
.long sys_preadv
.long sys_pwritev
+ .long sys_rt_tgsigqueueinfo /* 335 */
+ .long sys_perf_counter_open
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8c7b03b0cfc..124d40c575d 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
struct bau_desc *adp;
struct bau_desc *ad2;
- adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
+ /*
+ * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+ * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+ */
+ adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+ UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
BUG_ON(!adp);
pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
(n << UV_DESC_BASE_PNODE_SHIFT | m));
}
- for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+ /*
+ * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+ * cpu even though we only use the first one; one descriptor can
+ * describe a broadcast to 256 nodes.
+ */
+ for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+ i++, ad2++) {
memset(ad2, 0, sizeof(struct bau_desc));
ad2->header.sw_ack_flag = 1;
/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff..07d60c870ce 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
}
clts(); /* Allow maths ops (or we recurse) */
-#ifdef CONFIG_X86_32
- restore_fpu(tsk);
-#else
/*
* Paranoid restore. send a SIGSEGV if we fail to restore the state.
*/
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
force_sig(SIGSEGV, tsk);
return;
}
-#endif
+
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
}
@@ -945,8 +942,13 @@ void __init trap_init(void)
#endif
set_intr_gate(19, &simd_coprocessor_error);
+ /* Reserve all the builtin and the syscall vector: */
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+ set_bit(i, used_vectors);
+
#ifdef CONFIG_IA32_EMULATION
set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
@@ -963,17 +965,9 @@ void __init trap_init(void)
}
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
- /* Reserve all the builtin and the syscall vector: */
- for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
- set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
- set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
+
/*
* Should be a barrier for any external CPU state:
*/
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc43..3e1c057e98f 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
{
u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
+ unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
int hpet = is_hpet_enabled(), i, loopmin;
- tsc_khz = get_hypervisor_tsc_freq();
- if (tsc_khz) {
+ hv_tsc_khz = get_hypervisor_tsc_freq();
+ if (hv_tsc_khz) {
printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
- return tsc_khz;
+ return hv_tsc_khz;
}
local_irq_save(flags);
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
#ifdef CONFIG_X86_64
static cycle_t __vsyscall_fn vread_tsc(void)
{
- cycle_t ret = (cycle_t)vget_cycles();
+ cycle_t ret;
+
+ /*
+ * Surround the RDTSC by barriers, to make sure it's not
+ * speculated to outside the seqlock critical section and
+ * does not cause time warps:
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+ rdtsc_barrier();
return ret >= __vsyscall_gtod_data.clock.cycle_last ?
ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef..027b5b49899 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
* of a critical section, to be able to prove TSC time-warps:
*/
static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
static __cpuinitdata cycles_t last_tsc;
static __cpuinitdata cycles_t max_warp;
static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
return;
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
- printk(KERN_INFO
- "Skipping synchronization checks as TSC is reliable.\n");
+ pr_info("Skipping synchronization checks as TSC is reliable.\n");
return;
}
- printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
- smp_processor_id(), cpu);
+ pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
+ smp_processor_id(), cpu);
/*
* Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
if (nr_warps) {
printk("\n");
- printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
- " turning off TSC clock.\n", max_warp);
+ pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+ "turning off TSC clock.\n", max_warp);
mark_tsc_unstable("check_tsc_sync_source failed");
} else {
printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
while (atomic_read(&stop_count) != cpus)
cpu_relax();
}
-#undef NR_LOOPS
-
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1..9c4e6253905 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
info->regs.pt.ds = 0;
info->regs.pt.es = 0;
info->regs.pt.fs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
+#ifndef CONFIG_X86_32_LAZY_GS
+ info->regs.pt.gs = 0;
+#endif
/*
* The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
}
/*
- * Save old state, set default return value (%ax) to 0
+ * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
*/
- info->regs32->ax = 0;
+ info->regs32->ax = VM86_SIGNAL;
tsk->thread.saved_sp0 = tsk->thread.sp0;
tsk->thread.saved_fs = info->regs32->fs;
tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
__asm__ __volatile__(
"movl %0,%%esp\n\t"
"movl %1,%%ebp\n\t"
+#ifdef CONFIG_X86_32_LAZY_GS
"mov %2, %%gs\n\t"
+#endif
"jmp resume_userspace"
: /* no outputs */
:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95deb9f2211..b263423fbe2 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
}
#endif
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
{
- paravirt_enter_lazy_cpu();
+ paravirt_start_context_switch(prev);
vmi_ops.set_lazy_mode(2);
}
+static void vmi_end_context_switch(struct task_struct *next)
+{
+ vmi_ops.set_lazy_mode(0);
+ paravirt_end_context_switch(next);
+}
+
static void vmi_enter_lazy_mmu(void)
{
paravirt_enter_lazy_mmu();
vmi_ops.set_lazy_mode(1);
}
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
vmi_ops.set_lazy_mode(0);
+ paravirt_leave_lazy_mmu();
}
static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
para_fill(pv_cpu_ops.io_delay, IODelay);
- para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+ para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
set_lazy_mode, SetLazyMode);
para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
set_lazy_mode, SetLazyMode);
- para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+ para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
set_lazy_mode, SetLazyMode);
/* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f01..4c85b2e2bb6 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,431 @@
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation, unification and other changes and fixes:
+ * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
#ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
+#define LOAD_OFFSET __PAGE_OFFSET
#else
-# include "vmlinux_64.lds.S"
+#define LOAD_OFFSET __START_KERNEL_map
#endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386 /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_X86_64
+ user PT_LOAD FLAGS(7); /* RWE */
+ data.init PT_LOAD FLAGS(7); /* RWE */
+#ifdef CONFIG_SMP
+ percpu PT_LOAD FLAGS(7); /* RWE */
+#endif
+ data.init2 PT_LOAD FLAGS(7); /* RWE */
+#endif
+ note PT_NOTE FLAGS(0); /* ___ */
+}
+
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = startup_32 - LOAD_OFFSET;
+#else
+ . = __START_KERNEL;
+ phys_startup_64 = startup_64 - LOAD_OFFSET;
+#endif
+
+ /* Text and read-only data */
+
+ /* bootstrapping code */
+ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+ _text = .;
+ *(.text.head)
+ } :text = 0x9090
+
+ /* The rest of the text */
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_X86_32
+ /* not really needed, already page aligned */
+ . = ALIGN(PAGE_SIZE);
+ *(.text.page_aligned)
+#endif
+ . = ALIGN(8);
+ _stext = .;
+ TEXT_TEXT
+ SCHED_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
+ IRQENTRY_TEXT
+ *(.fixup)
+ *(.gnu.warning)
+ /* End of text section */
+ _etext = .;
+ } :text = 0x9090
+
+ NOTES :text :note
+
+ /* Exception table */
+ . = ALIGN(16);
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+ __start___ex_table = .;
+ *(__ex_table)
+ __stop___ex_table = .;
+ } :text = 0x9090
+
+ RODATA
+
+ /* Data */
+ . = ALIGN(PAGE_SIZE);
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
+ DATA_DATA
+ CONSTRUCTORS
+
+#ifdef CONFIG_X86_64
+ /* End of data section */
+ _edata = .;
+#endif
+ } :data
+
+#ifdef CONFIG_X86_32
+ /* 32 bit has nosave before _edata */
+ . = ALIGN(PAGE_SIZE);
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(PAGE_SIZE);
+ __nosave_end = .;
+ }
+#endif
+
+ . = ALIGN(PAGE_SIZE);
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
+ *(.data.idt)
+ }
+
+#ifdef CONFIG_X86_32
+ . = ALIGN(32);
+#else
+ . = ALIGN(PAGE_SIZE);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+#endif
+ .data.cacheline_aligned :
+ AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
+
+ /* rarely changed data like cpu maps */
+#ifdef CONFIG_X86_32
+ . = ALIGN(32);
+#else
+ . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+#endif
+ .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+ *(.data.read_mostly)
+
+#ifdef CONFIG_X86_32
+ /* End of data section */
+ _edata = .;
+#endif
+ }
+
+#ifdef CONFIG_X86_64
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+ SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+ SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+ . = VSYSCALL_ADDR;
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+ *(.vsyscall_0)
+ } :user
+
+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+ *(.vsyscall_fn)
+ }
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+ *(.vsyscall_gtod_data)
+ }
+
+ vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+ .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+ *(.vsyscall_clock)
+ }
+ vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+ *(.vsyscall_1)
+ }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+ *(.vsyscall_2)
+ }
+
+ .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+ *(.vgetcpu_mode)
+ }
+ vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ .jiffies : AT(VLOAD(.jiffies)) {
+ *(.jiffies)
+ }
+ jiffies = VVIRT(.jiffies);
+
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+ *(.vsyscall_3)
+ }
+
+ . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+#endif /* CONFIG_X86_64 */
+
+ /* init_task */
+ . = ALIGN(THREAD_SIZE);
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ *(.data.init_task)
+ }
+#ifdef CONFIG_X86_64
+ :data.init
+#endif
+
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ __smp_locks_end = .;
+ . = ALIGN(PAGE_SIZE);
+ }
+
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
+ _sinittext = .;
+ INIT_TEXT
+ _einittext = .;
+ }
+
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+ INIT_DATA
+ }
+
+ . = ALIGN(16);
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+ __setup_start = .;
+ *(.init.setup)
+ __setup_end = .;
+ }
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+ __initcall_start = .;
+ INITCALLS
+ __initcall_end = .;
+ }
+
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ __con_initcall_start = .;
+ *(.con_initcall.init)
+ __con_initcall_end = .;
+ }
+
+ .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+ __x86_cpu_dev_start = .;
+ *(.x86_cpu_dev.init)
+ __x86_cpu_dev_end = .;
+ }
+
+ SECURITY_INIT
+
+ . = ALIGN(8);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __parainstructions = .;
+ *(.parainstructions)
+ __parainstructions_end = .;
+ }
+
+ . = ALIGN(8);
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ __alt_instructions = .;
+ *(.altinstructions)
+ __alt_instructions_end = .;
+ }
+
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
+
+ /*
+ * .exit.text is discard at runtime, not link time, to deal with
+ * references from .altinstructions and .eh_frame
+ */
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+ EXIT_TEXT
+ }
+
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+ EXIT_DATA
+ }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ . = ALIGN(PAGE_SIZE);
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+ __initramfs_start = .;
+ *(.init.ramfs)
+ __initramfs_end = .;
+ }
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+ /*
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - __data_nosave - should
+ * start another section data.init2. Also, pda should be at the head of
+ * percpu area. Preallocate it and define the percpu offset symbol
+ * so that it can be accessed as a percpu variable.
+ */
+ . = ALIGN(PAGE_SIZE);
+ PERCPU_VADDR(0, :percpu)
+#else
+ PERCPU(PAGE_SIZE)
+#endif
+
+ . = ALIGN(PAGE_SIZE);
+
+ /* freed after init ends here */
+ .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+ __init_end = .;
+ }
+
+#ifdef CONFIG_X86_64
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+ . = ALIGN(PAGE_SIZE);
+ __nosave_begin = .;
+ *(.data.nosave)
+ . = ALIGN(PAGE_SIZE);
+ __nosave_end = .;
+ } :data.init2
+ /* use another section data.init2, see PERCPU_VADDR() above */
+#endif
+
+ /* BSS */
+ . = ALIGN(PAGE_SIZE);
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ __bss_start = .;
+ *(.bss.page_aligned)
+ *(.bss)
+ . = ALIGN(4);
+ __bss_stop = .;
+ }
+
+ . = ALIGN(PAGE_SIZE);
+ .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+ __brk_base = .;
+ . += 64 * 1024; /* 64k alignment slop space */
+ *(.brk_reservation) /* areas brk users have reserved */
+ __brk_limit = .;
+ }
+
+ .end : AT(ADDR(.end) - LOAD_OFFSET) {
+ _end = .;
+ }
+
+ /* Sections to be discarded */
+ /DISCARD/ : {
+ *(.exitcall.exit)
+ *(.eh_frame)
+ *(.discard)
+ }
+
+ STABS_DEBUG
+ DWARF_DEBUG
+}
+
+
+#ifdef CONFIG_X86_32
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+ "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "kexec control code size is too big")
+#endif
+
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 62ad500d55f..00000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,229 +0,0 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page_types.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(0); /* ___ */
-}
-SECTIONS
-{
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
- phys_startup_32 = startup_32 - LOAD_OFFSET;
-
- .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
- _text = .; /* Text and read-only data */
- *(.text.head)
- } :text = 0x9090
-
- /* read-only */
- .text : AT(ADDR(.text) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
- *(.text.page_aligned)
- TEXT_TEXT
- SCHED_TEXT
- LOCK_TEXT
- KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
- *(.gnu.warning)
- _etext = .; /* End of text section */
- } :text = 0x9090
-
- NOTES :text :note
-
- . = ALIGN(16); /* Exception table */
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
-
- RODATA
-
- /* writeable */
- . = ALIGN(PAGE_SIZE);
- .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
- DATA_DATA
- CONSTRUCTORS
- } :data
-
- . = ALIGN(PAGE_SIZE);
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- }
-
- . = ALIGN(PAGE_SIZE);
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- *(.data.page_aligned)
- *(.data.idt)
- }
-
- . = ALIGN(32);
- .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- *(.data.cacheline_aligned)
- }
-
- /* rarely changed data like cpu maps */
- . = ALIGN(32);
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
- _edata = .; /* End of data section */
- }
-
- . = ALIGN(THREAD_SIZE); /* init_task */
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- *(.data.init_task)
- }
-
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- }
- /* will be freed after init
- * Following ALIGN() is required to make sure no other data falls on the
- * same page where __smp_alt_end is pointing as that page might be freed
- * after boot. Always make sure that ALIGN() directive is present after
- * the section which contains __smp_alt_end.
- */
- . = ALIGN(PAGE_SIZE);
-
- /* will be freed after init */
- . = ALIGN(PAGE_SIZE); /* Init code and data */
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- __init_begin = .;
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- INIT_DATA
- }
- . = ALIGN(16);
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
- }
- SECURITY_INIT
- . = ALIGN(4);
- .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
- __alt_instructions = .;
- *(.altinstructions)
- __alt_instructions_end = .;
- }
- .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
- *(.altinstr_replacement)
- }
- . = ALIGN(4);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
- /* .exit.text is discard at runtime, not link time, to deal with references
- from .altinstructions and .eh_frame */
- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
- EXIT_TEXT
- }
- .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
- EXIT_DATA
- }
-#if defined(CONFIG_BLK_DEV_INITRD)
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
- PERCPU(PAGE_SIZE)
- . = ALIGN(PAGE_SIZE);
- /* freed after init ends here */
-
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- __init_end = .;
- __bss_start = .; /* BSS */
- *(.bss.page_aligned)
- *(.bss)
- . = ALIGN(4);
- __bss_stop = .;
- }
-
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __brk_base = . ;
- . += 64 * 1024 ; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
- __brk_limit = . ;
- }
-
- .end : AT(ADDR(.end) - LOAD_OFFSET) {
- _end = . ;
- }
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.discard)
- }
-
- STABS_DEBUG
-
- DWARF_DEBUG
-}
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_KEXEC
-/* Link time checks */
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index c8742507b03..00000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,298 +0,0 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/asm-offsets.h>
-#include <asm/page_types.h>
-
-#undef i386 /* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
-PHDRS {
- text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
- user PT_LOAD FLAGS(7); /* RWE */
- data.init PT_LOAD FLAGS(7); /* RWE */
-#ifdef CONFIG_SMP
- percpu PT_LOAD FLAGS(7); /* RWE */
-#endif
- data.init2 PT_LOAD FLAGS(7); /* RWE */
- note PT_NOTE FLAGS(0); /* ___ */
-}
-SECTIONS
-{
- . = __START_KERNEL;
- phys_startup_64 = startup_64 - LOAD_OFFSET;
- .text : AT(ADDR(.text) - LOAD_OFFSET) {
- _text = .; /* Text and read-only data */
- /* First the code that has to be first for bootstrapping */
- *(.text.head)
- _stext = .;
- /* Then the rest */
- TEXT_TEXT
- SCHED_TEXT
- LOCK_TEXT
- KPROBES_TEXT
- IRQENTRY_TEXT
- *(.fixup)
- *(.gnu.warning)
- _etext = .; /* End of text section */
- } :text = 0x9090
-
- NOTES :text :note
-
- . = ALIGN(16); /* Exception table */
- __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :text = 0x9090
-
- RODATA
-
- . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
- /* Data */
- .data : AT(ADDR(.data) - LOAD_OFFSET) {
- DATA_DATA
- CONSTRUCTORS
- _edata = .; /* End of data section */
- } :data
-
-
- .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- *(.data.cacheline_aligned)
- }
- . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
- }
-
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
- . = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
- __vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
- { *(.vsyscall_gtod_data) }
- vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
- .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
- { *(.vsyscall_clock) }
- vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
- { *(.vsyscall_1) }
- .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
- { *(.vsyscall_2) }
-
- .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
- vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
- jiffies = VVIRT(.jiffies);
-
- .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
- { *(.vsyscall_3) }
-
- . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- . = ALIGN(THREAD_SIZE); /* init_task */
- *(.data.init_task)
- }:data.init
-
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- *(.data.page_aligned)
- }
-
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- /* might get freed after init */
- . = ALIGN(PAGE_SIZE);
- __smp_alt_begin = .;
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- __smp_alt_end = .;
- }
-
- . = ALIGN(PAGE_SIZE); /* Init code and data */
- __init_begin = .; /* paired with __init_end */
- .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
- .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
- __initdata_begin = .;
- INIT_DATA
- __initdata_end = .;
- }
-
- .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
- . = ALIGN(16);
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
- .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
- .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
- __x86_cpu_dev_start = .;
- *(.x86_cpu_dev.init)
- __x86_cpu_dev_end = .;
- }
- SECURITY_INIT
-
- . = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
-
- .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
- . = ALIGN(8);
- __alt_instructions = .;
- *(.altinstructions)
- __alt_instructions_end = .;
- }
- .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
- *(.altinstr_replacement)
- }
- /* .exit.text is discard at runtime, not link time, to deal with references
- from .altinstructions and .eh_frame */
- .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
- EXIT_TEXT
- }
- .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
- EXIT_DATA
- }
-
-#ifdef CONFIG_BLK_DEV_INITRD
- . = ALIGN(PAGE_SIZE);
- .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
- __initramfs_start = .;
- *(.init.ramfs)
- __initramfs_end = .;
- }
-#endif
-
-#ifdef CONFIG_SMP
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - __data_nosave - should
- * start another section data.init2. Also, pda should be at the head of
- * percpu area. Preallocate it and define the percpu offset symbol
- * so that it can be accessed as a percpu variable.
- */
- . = ALIGN(PAGE_SIZE);
- PERCPU_VADDR(0, :percpu)
-#else
- PERCPU(PAGE_SIZE)
-#endif
-
- . = ALIGN(PAGE_SIZE);
- __init_end = .;
-
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __bss_start = .; /* BSS */
- *(.bss.page_aligned)
- *(.bss)
- __bss_stop = .;
- }
-
- .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __brk_base = . ;
- . += 64 * 1024 ; /* 64k alignment slop space */
- *(.brk_reservation) /* areas brk users have reserved */
- __brk_limit = . ;
- }
-
- _end = . ;
-
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
- }
-
- STABS_DEBUG
-
- DWARF_DEBUG
-}
-
- /*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
- */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
- "irq_stack_union is not at start of per-cpu area");
-#endif
-
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
- "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc906..25ee06a80aa 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
return;
}
- /*
- * Surround the RDTSC by barriers, to make sure it's not
- * speculated to outside the seqlock critical section and
- * does not cause time warps:
- */
- rdtsc_barrier();
now = vread();
- rdtsc_barrier();
-
base = __vsyscall_gtod_data.clock.cycle_last;
mask = __vsyscall_gtod_data.clock.mask;
mult = __vsyscall_gtod_data.clock.mult;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 33a93b41739..4e0c2655939 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -167,10 +167,16 @@ static void lazy_hcall3(unsigned long call,
/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
* issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
+{
+ kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ paravirt_end_context_switch(next);
}
/*G:033
@@ -637,7 +643,7 @@ static void __init lguest_init_IRQ(void)
void lguest_setup_irq(unsigned int irq)
{
- irq_to_desc_alloc_cpu(irq, 0);
+ irq_to_desc_alloc_node(irq, 0);
set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
handle_level_irq, "level");
}
@@ -1054,8 +1060,8 @@ __init void lguest_init(void)
pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
- pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+ pv_cpu_ops.end_context_switch = lguest_end_context_switch;
/* pagetable management */
pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1068,7 +1074,7 @@ __init void lguest_init(void)
pv_mmu_ops.read_cr2 = lguest_read_cr2;
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
pv_mmu_ops.pte_update = lguest_pte_update;
pv_mmu_ops.pte_update_defer = lguest_pte_update;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cbcfb4..a725b7f760a 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
+ int width = sizeof(unsigned long) * 2;
/*
* Now print the actual finished series
*/
- seq_printf(m, "0x%p-0x%p ",
- (void *)st->start_address,
- (void *)st->current_address);
+ seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, st->current_address);
delta = (st->current_address - st->start_address) >> 10;
while (!(delta & 1023) && unit[1]) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa..c6acc632637 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,17 @@
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
-#include <linux/interrupt.h>
-#include <linux/mmiotrace.h>
-#include <linux/bootmem.h>
-#include <linux/compiler.h>
-#include <linux/highmem.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/vt_kern.h>
-#include <linux/signal.h>
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/errno.h>
-#include <linux/magic.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/tty.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-
-#include <asm-generic/sections.h>
-
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/segment.h>
-#include <asm/system.h>
-#include <asm/proto.h>
-#include <asm/traps.h>
-#include <asm/desc.h>
+#include <linux/magic.h> /* STACK_END_MAGIC */
+#include <linux/sched.h> /* test_thread_flag(), ... */
+#include <linux/kdebug.h> /* oops_begin/end, ... */
+#include <linux/module.h> /* search_exception_table */
+#include <linux/bootmem.h> /* max_low_pfn */
+#include <linux/kprobes.h> /* __kprobes, ... */
+#include <linux/mmiotrace.h> /* kmmio_handler, ... */
+#include <linux/perf_counter.h> /* perf_swcounter_event */
+
+#include <asm/traps.h> /* dotraplinkage, ... */
+#include <asm/pgalloc.h> /* pgd_*(), ... */
/*
* Page fault error code bits:
@@ -225,12 +202,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
if (!pmd_present(*pmd_k))
return NULL;
- if (!pmd_present(*pmd)) {
+ if (!pmd_present(*pmd))
set_pmd(pmd, *pmd_k);
- arch_flush_lazy_mmu_mode();
- } else {
+ else
BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
- }
return pmd_k;
}
@@ -538,8 +513,6 @@ bad:
static int is_errata93(struct pt_regs *regs, unsigned long address)
{
#ifdef CONFIG_X86_64
- static int once;
-
if (address != regs->ip)
return 0;
@@ -549,10 +522,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
address |= 0xffffffffUL << 32;
if ((address >= (u64)_stext && address <= (u64)_etext) ||
(address >= MODULES_VADDR && address <= MODULES_END)) {
- if (!once) {
- printk(errata93_warning);
- once = 1;
- }
+ printk_once(errata93_warning);
regs->ip = address;
return 1;
}
@@ -1044,6 +1014,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(error_code & PF_RSVD))
pgtable_bad(regs, error_code, address);
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
/*
* If we're in an interrupt, have no user context or are running
* in an atomic region then we must not take the fault:
@@ -1137,10 +1109,15 @@ good_area:
return;
}
- if (fault & VM_FAULT_MAJOR)
+ if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
- else
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+ regs, address);
+ } else {
tsk->min_flt++;
+ perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+ regs, address);
+ }
check_v8086_mode(regs, address, tsk);
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 8126e8d1a2a..58f621e8191 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
- arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
#endif
}
- arch_flush_lazy_mmu_mode();
pagefault_enable();
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ae4f7b5d710..34c1bfb64f1 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
+#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
@@ -10,6 +11,9 @@
#include <asm/setup.h>
#include <asm/system.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long __initdata e820_table_start;
unsigned long __meminitdata e820_table_end;
@@ -23,6 +27,69 @@ int direct_gbpages
#endif
;
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on Enable
+ * off Disable
+ */
+static int __init noexec_setup(char *str)
+{
+ if (!str)
+ return -EINVAL;
+ if (!strncmp(str, "on", 2)) {
+ __supported_pte_mask |= _PAGE_NX;
+ disable_nx = 0;
+ } else if (!strncmp(str, "off", 3)) {
+ disable_nx = 1;
+ __supported_pte_mask &= ~_PAGE_NX;
+ }
+ return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+static void __init set_nx(void)
+{
+ unsigned int v[4], l, h;
+
+ if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+ cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+ if ((v[3] & (1 << 20)) && !disable_nx) {
+ rdmsr(MSR_EFER, l, h);
+ l |= EFER_NX;
+ wrmsr(MSR_EFER, l, h);
+ nx_enabled = 1;
+ __supported_pte_mask |= _PAGE_NX;
+ }
+ }
+}
+#else
+static inline void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+ unsigned long efer;
+
+ rdmsrl(MSR_EFER, efer);
+ if (!(efer & EFER_NX) || disable_nx)
+ __supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
static void __init find_early_table_space(unsigned long end, int use_pse,
int use_gbpages)
{
@@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
*/
#ifdef CONFIG_X86_32
start = 0x7000;
- e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
- tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
+#else
start = 0x8000;
- e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
#endif
+ e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+ tables, PAGE_SIZE);
if (e820_table_start == -1UL)
panic("Cannot find space for the kernel page tables");
@@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
use_gbpages = direct_gbpages;
#endif
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
/* Enable PSE if available */
if (cpu_has_pse)
@@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
-#endif
if (use_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559ed80f..949708d7a48 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,12 +49,9 @@
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
+#include <asm/page_types.h>
#include <asm/init.h>
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
@@ -587,61 +584,9 @@ void zap_low_mappings(void)
flush_tlb_all();
}
-int nx_enabled;
-
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata;
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on Enable
- * off Disable
- */
-static int __init noexec_setup(char *str)
-{
- if (!str || !strcmp(str, "on")) {
- if (cpu_has_nx) {
- __supported_pte_mask |= _PAGE_NX;
- disable_nx = 0;
- }
- } else {
- if (!strcmp(str, "off")) {
- disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- } else {
- return -EINVAL;
- }
- }
-
- return 0;
-}
-early_param("noexec", noexec_setup);
-
-void __init set_nx(void)
-{
- unsigned int v[4], l, h;
-
- if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
- cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
- if ((v[3] & (1 << 20)) && !disable_nx) {
- rdmsr(MSR_EFER, l, h);
- l |= EFER_NX;
- wrmsr(MSR_EFER, l, h);
- nx_enabled = 1;
- __supported_pte_mask |= _PAGE_NX;
- }
- }
-}
-#endif
-
/* user-defined highmem size */
static unsigned int highmem_pages = -1;
@@ -761,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn,
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
- memory_present(0, 0, highend_pfn);
e820_register_active_regions(0, 0, highend_pfn);
+ sparse_memory_present_with_active_regions(0);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- memory_present(0, 0, max_low_pfn);
e820_register_active_regions(0, 0, max_low_pfn);
+ sparse_memory_present_with_active_regions(0);
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e8020df..52bb9519bb8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,18 +50,8 @@
#include <asm/cacheflush.h>
#include <asm/init.h>
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
static unsigned long dma_reserve __initdata;
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
static int __init parse_direct_gbpages_off(char *arg)
{
direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec=on|off
- * Control non-executable mappings for 64-bit processes.
- *
- * on Enable (default)
- * off Disable
- */
-static int __init nonx_setup(char *str)
-{
- if (!str)
- return -EINVAL;
- if (!strncmp(str, "on", 2)) {
- __supported_pte_mask |= _PAGE_NX;
- disable_nx = 0;
- } else if (!strncmp(str, "off", 3)) {
- disable_nx = 1;
- __supported_pte_mask &= ~_PAGE_NX;
- }
- return 0;
-}
-early_param("noexec", nonx_setup);
-
-void __cpuinit check_efer(void)
-{
- unsigned long efer;
-
- rdmsrl(MSR_EFER, efer);
- if (!(efer & EFER_NX) || disable_nx)
- __supported_pte_mask &= ~_PAGE_NX;
-}
-
int force_personality32;
/*
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
+#endif
void __init paging_init(void)
{
@@ -638,11 +596,10 @@ void __init paging_init(void)
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
max_zone_pfns[ZONE_NORMAL] = max_pfn;
- memory_present(0, 0, max_pfn);
+ sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
free_area_init_nodes(max_zone_pfns);
}
-#endif
/*
* Memory hotplug specific functions
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 8056545e2d3..fe6f84ca121 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
kpte_clear_flush(kmap_pte-idx, vaddr);
- arch_flush_lazy_mmu_mode();
pagefault_enable();
}
EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 50dc802a1c4..16ccbd77917 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
unsigned long page; /* location of the fault page */
- bool old_presence; /* page presence prior to arming */
+ pteval_t old_presence; /* page presence prior to arming */
bool armed;
/*
@@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
{
struct list_head *head;
- struct kmmio_fault_page *p;
+ struct kmmio_fault_page *f;
page &= PAGE_MASK;
head = kmmio_page_list(page);
- list_for_each_entry_rcu(p, head, list) {
- if (p->page == page)
- return p;
+ list_for_each_entry_rcu(f, head, list) {
+ if (f->page == page)
+ return f;
}
return NULL;
}
-static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
pmdval_t v = pmd_val(*pmd);
- *old = !!(v & _PAGE_PRESENT);
- v &= ~_PAGE_PRESENT;
- if (present)
- v |= _PAGE_PRESENT;
+ if (clear) {
+ *old = v & _PAGE_PRESENT;
+ v &= ~_PAGE_PRESENT;
+ } else /* presume this has been called with clear==true previously */
+ v |= *old;
set_pmd(pmd, __pmd(v));
}
-static void set_pte_presence(pte_t *pte, bool present, bool *old)
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
- *old = !!(v & _PAGE_PRESENT);
- v &= ~_PAGE_PRESENT;
- if (present)
- v |= _PAGE_PRESENT;
+ if (clear) {
+ *old = v & _PAGE_PRESENT;
+ v &= ~_PAGE_PRESENT;
+ } else /* presume this has been called with clear==true previously */
+ v |= *old;
set_pte_atomic(pte, __pte(v));
}
-static int set_page_presence(unsigned long addr, bool present, bool *old)
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
unsigned int level;
- pte_t *pte = lookup_address(addr, &level);
+ pte_t *pte = lookup_address(f->page, &level);
if (!pte) {
- pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+ pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
return -1;
}
switch (level) {
case PG_LEVEL_2M:
- set_pmd_presence((pmd_t *)pte, present, old);
+ clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
break;
case PG_LEVEL_4K:
- set_pte_presence(pte, present, old);
+ clear_pte_presence(pte, clear, &f->old_presence);
break;
default:
pr_err("kmmio: unexpected page level 0x%x.\n", level);
return -1;
}
- __flush_tlb_one(addr);
+ __flush_tlb_one(f->page);
return 0;
}
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
if (f->armed) {
pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
- f->page, f->count, f->old_presence);
+ f->page, f->count, !!f->old_presence);
}
- ret = set_page_presence(f->page, false, &f->old_presence);
+ ret = clear_page_presence(f, true);
WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
f->armed = true;
return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
- bool tmp;
- int ret = set_page_presence(f->page, f->old_presence, &tmp);
+ int ret = clear_page_presence(f, false);
WARN_ONCE(ret < 0,
KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
f->armed = false;
@@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
if (!ctx->active) {
- pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+ /*
+ * debug traps without an active context are due to either
+ * something external causing them (f.e. using a debugger while
+ * mmio tracing enabled), or erroneous behaviour
+ */
+ pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
smp_processor_id());
goto out;
}
@@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
head,
struct kmmio_delayed_release,
rcu);
- struct kmmio_fault_page *p = dr->release_list;
- while (p) {
- struct kmmio_fault_page *next = p->release_next;
- BUG_ON(p->count);
- kfree(p);
- p = next;
+ struct kmmio_fault_page *f = dr->release_list;
+ while (f) {
+ struct kmmio_fault_page *next = f->release_next;
+ BUG_ON(f->count);
+ kfree(f);
+ f = next;
}
kfree(dr);
}
@@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
{
struct kmmio_delayed_release *dr =
container_of(head, struct kmmio_delayed_release, rcu);
- struct kmmio_fault_page *p = dr->release_list;
+ struct kmmio_fault_page *f = dr->release_list;
struct kmmio_fault_page **prevp = &dr->release_list;
unsigned long flags;
spin_lock_irqsave(&kmmio_lock, flags);
- while (p) {
- if (!p->count) {
- list_del_rcu(&p->list);
- prevp = &p->release_next;
+ while (f) {
+ if (!f->count) {
+ list_del_rcu(&f->list);
+ prevp = &f->release_next;
} else {
- *prevp = p->release_next;
+ *prevp = f->release_next;
}
- p = p->release_next;
+ f = f->release_next;
}
spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
}
EXPORT_SYMBOL(unregister_kmmio_probe);
-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
- void *args)
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
struct die_args *arg = args;
@@ -544,11 +550,23 @@ static struct notifier_block nb_die = {
.notifier_call = kmmio_die_notifier
};
-static int __init init_kmmio(void)
+int kmmio_init(void)
{
int i;
+
for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
INIT_LIST_HEAD(&kmmio_page_table[i]);
+
return register_die_notifier(&nb_die);
}
-fs_initcall(init_kmmio); /* should be before device_initcall() */
+
+void kmmio_cleanup(void)
+{
+ int i;
+
+ unregister_die_notifier(&nb_die);
+ for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+ WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+ KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+ }
+}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be0621..c0bedcd10f9 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
{
- u64 i, count;
- u64 *start;
+ u64 *p;
+ void *start, *end;
u64 start_bad, last_bad;
u64 start_phys_aligned;
size_t incr;
incr = sizeof(pattern);
start_phys_aligned = ALIGN(start_phys, incr);
- count = (size - (start_phys_aligned - start_phys))/incr;
start = __va(start_phys_aligned);
+ end = start + size - (start_phys_aligned - start_phys);
start_bad = 0;
last_bad = 0;
- for (i = 0; i < count; i++)
- start[i] = pattern;
- for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
- if (*start == pattern)
+ for (p = start; p < end; p++)
+ *p = pattern;
+ for (p = start; p < end; p++, start_phys_aligned += incr) {
+ if (*p == pattern)
continue;
if (start_phys_aligned == last_bad + incr) {
last_bad += incr;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index c9342ed8b40..132772a8ec5 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
if (nommiotrace)
pr_info(NAME "MMIO tracing disabled.\n");
+ kmmio_init();
enter_uniprocessor();
spin_lock_irq(&trace_lock);
atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
+ kmmio_cleanup();
pr_info(NAME "disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12029d..459913beac7 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
}
/* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start,
- unsigned long end)
+void __init
+setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+ const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
- const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
int nid;
if (!end)
return;
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ return;
+
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
-#ifdef CONFIG_ACPI_NUMA
- srat_reserve_add_area(nodeid);
-#endif
node_set_online(nodeid);
}
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-void __init paging_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
-
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
- sparse_init();
-
- free_area_init_nodes(max_zone_pfns);
-}
-
static __init int numa_setup(char *opt)
{
if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt, "noacpi", 6))
acpi_numa = -1;
- if (!strncmp(opt, "hotadd=", 7))
- hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
return 0;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e17efed088c..6ce9518fe2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
vm_unmap_aliases();
- /*
- * If we're called with lazy mmu updates enabled, the
- * in-memory pte state may be stale. Flush pending updates to
- * bring them up to date.
- */
- arch_flush_lazy_mmu_mode();
-
cpa.vaddr = addr;
cpa.pages = pages;
cpa.numpages = numpages;
@@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
} else
cpa_flush_all(cache);
- /*
- * If we've been called with lazy mmu updates enabled, then
- * make sure that everything gets flushed out before we
- * return.
- */
- arch_flush_lazy_mmu_mode();
-
out:
return ret;
}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 01765955baa..2dfcbf9df2a 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
static nodemask_t cpu_nodes_parsed __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES];
-static int found_add_area __initdata;
-int hotadd_percent __initdata = 0;
static int num_node_memblks __initdata;
static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
-/* Too small nodes confuse the VM badly. Usually they result
- from BIOS bugs. */
-#define NODE_MIN_SIZE (4*1024*1024)
-
static __init int setup_node(int pxm)
{
return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
struct bootnode *nd = &nodes[i];
- if (found_add_area)
- return;
-
if (nd->start < start) {
nd->start = start;
if (nd->end < nd->start)
@@ -86,7 +77,6 @@ static __init void bad_srat(void)
int i;
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
- found_add_area = 0;
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
pxm, apic_id, node);
}
-static int update_end_of_memory(unsigned long end) {return -1;}
-static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
/*
- * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add information.
- * This code supports one contiguous hot add area per node.
+ * Update nodes_add[]
+ * This code supports one contiguous hot add area per node
*/
-static int __init
-reserve_hotadd(int node, unsigned long start, unsigned long end)
+static void __init
+update_nodes_add(int node, unsigned long start, unsigned long end)
{
unsigned long s_pfn = start >> PAGE_SHIFT;
unsigned long e_pfn = end >> PAGE_SHIFT;
- int ret = 0, changed = 0;
+ int changed = 0;
struct bootnode *nd = &nodes_add[node];
/* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
mistakes */
if ((signed long)(end - start) < NODE_MIN_SIZE) {
printk(KERN_ERR "SRAT: Hotplug area too small\n");
- return -1;
+ return;
}
/* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
printk(KERN_ERR
"SRAT: Hotplug area %lu -> %lu has existing memory\n",
s_pfn, e_pfn);
- return -1;
- }
-
- if (!hotadd_enough_memory(&nodes_add[node])) {
- printk(KERN_ERR "SRAT: Hotplug area too large\n");
- return -1;
+ return;
}
/* Looks good */
@@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
}
- ret = update_end_of_memory(nd->end);
-
if (changed)
- printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
- return ret;
+ printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+ nd->start, nd->end);
}
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
start, end);
e820_register_active_regions(node, start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
- push_node_boundaries(node, nd->start >> PAGE_SHIFT,
- nd->end >> PAGE_SHIFT);
- if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
- (reserve_hotadd(node, start, end) < 0)) {
- /* Ignore hotadd region. Undo damage */
- printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+ if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+ update_nodes_add(node, start, end);
+ /* restore nodes[node] */
*nd = oldnode;
if ((nd->start | nd->end) == 0)
node_clear(node, nodes_parsed);
@@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
pxmram = 0;
}
- e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
- /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
- if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+ e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
+ /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+ if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
printk(KERN_ERR
"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
(pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
return 1;
}
-static void __init unparse_node(int node)
-{
- int i;
- node_clear(node, nodes_parsed);
- node_clear(node, cpu_nodes_parsed);
- for (i = 0; i < MAX_LOCAL_APIC; i++) {
- if (apicid_to_node[i] == node)
- apicid_to_node[i] = NUMA_NO_NODE;
- }
-}
-
void __init acpi_numa_arch_fixup(void) {}
/* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
/* First clean up the node list */
- for (i = 0; i < MAX_NUMNODES; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++)
cutoff_node(i, start, end);
- /*
- * don't confuse VM with a node that doesn't have the
- * minimum memory.
- */
- if (nodes[i].end &&
- (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
- unparse_node(i);
- node_set_offline(i);
- }
- }
if (!nodes_cover_memory(nodes)) {
bad_srat();
@@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
if (node == NUMA_NO_NODE)
continue;
- if (!node_isset(node, node_possible_map))
+ if (!node_online(node))
numa_clear_node(i);
}
numa_init_array();
@@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b)
}
#endif /* CONFIG_NUMA_EMU */
-void __init srat_reserve_add_area(int nodeid)
-{
- if (found_add_area && nodes_add[nodeid].end) {
- u64 total_mb;
-
- printk(KERN_INFO "SRAT: Reserving hot-add memory space "
- "for node %d at %Lx-%Lx\n",
- nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
- total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
- >> PAGE_SHIFT;
- total_mb *= sizeof(struct page);
- total_mb >>= 20;
- printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
- "pre-allocated memory.\n", (unsigned long long)total_mb);
- reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
- nodes_add[nodeid].end - nodes_add[nodeid].start,
- BOOTMEM_DEFAULT);
- }
-}
-
int __node_distance(int a, int b)
{
int index;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 80b63d5db50..7826dfcc842 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -60,8 +60,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
switch (val) {
case DIE_NMI:
- if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
- ret = NOTIFY_STOP;
+ case DIE_NMI_IPI:
+ model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+ ret = NOTIFY_STOP;
break;
default:
break;
@@ -146,7 +147,7 @@ static void nmi_cpu_setup(void *dummy)
static struct notifier_block profile_exceptions_nb = {
.notifier_call = profile_exceptions_notify,
.next = NULL,
- .priority = 0
+ .priority = 2
};
static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 0a261a5c696..cd72d5c73b4 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -118,6 +118,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
u64 val;
int i;
+ /*
+ * This can happen if perf counters are in use when
+ * we steal the die notifier NMI.
+ */
+ if (unlikely(!reset_value))
+ goto out;
+
for (i = 0 ; i < num_counters; ++i) {
if (!reset_value[i])
continue;
@@ -128,6 +135,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
wrmsrl(msrs->counters[i].addr, -reset_value[i]);
}
+out:
/* Only P6 based Pentium M need to re-unmask the apic vector but it
* doesn't hurt other P6 variant */
apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index fda52b4c1b9..505489873b9 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -13,7 +13,7 @@
#define OP_X86_MODEL_H
#include <asm/types.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
struct op_msr {
unsigned long addr;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index fecbce6e7d7..0696d506c4a 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
return 0;
}
+ if (io_apic_assign_pci_irqs)
+ return 0;
+
/* Find IRQ routing entry */
if (!pirq_table)
@@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
pirq_penalty[dev->irq]++;
}
+ if (io_apic_assign_pci_irqs)
+ return;
+
dev = NULL;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (!pin)
continue;
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Recalculate IRQ numbers if we use the I/O APIC.
- */
- if (io_apic_assign_pci_irqs) {
- int irq;
-
- /*
- * interrupt pins are numbered starting from 1
- */
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn), pin - 1);
- /*
- * Busses behind bridges are typically not listed in the
- * MP-table. In this case we have to look up the IRQ
- * based on the parent bus, parent slot, and pin number.
- * The SMP code detects such bridged busses itself so we
- * should get into this branch reliably.
- */
- if (irq < 0 && dev->bus->parent) {
- /* go back to the bridge */
- struct pci_dev *bridge = dev->bus->self;
- int bus;
-
- pin = pci_swizzle_interrupt_pin(dev, pin);
- bus = bridge->bus->number;
- irq = IO_APIC_get_PCI_irq_vector(bus,
- PCI_SLOT(bridge->devfn), pin - 1);
- if (irq >= 0)
- dev_warn(&dev->dev,
- "using bridge %s INT %c to "
- "get IRQ %d\n",
- pci_name(bridge),
- 'A' + pin - 1, irq);
- }
- if (irq >= 0) {
- dev_info(&dev->dev,
- "PCI->APIC IRQ transform: INT %c "
- "-> IRQ %d\n",
- 'A' + pin - 1, irq);
- dev->irq = irq;
- }
- }
-#endif
/*
* Still no IRQ? Try to lookup one...
*/
@@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void)
pcibios_enable_irq = pirq_enable_irq;
pcibios_fixup_irqs();
+
+ if (io_apic_assign_pci_irqs && pci_routeirq) {
+ struct pci_dev *dev = NULL;
+ /*
+ * PCI IRQ routing is set up by pci_enable_device(), but we
+ * also do it here in case there are still broken drivers that
+ * don't use pci_enable_device().
+ */
+ printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+ for_each_pci_dev(dev)
+ pirq_enable_irq(dev);
+ }
+
return 0;
}
@@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active)
static int pirq_enable_irq(struct pci_dev *dev)
{
u8 pin;
- struct pci_dev *temp_dev;
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
- if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+ if (pin && !pcibios_lookup_irq(dev, 1)) {
char *msg = "";
+ if (!io_apic_assign_pci_irqs && dev->irq)
+ return 0;
+
if (io_apic_assign_pci_irqs) {
+#ifdef CONFIG_X86_IO_APIC
+ struct pci_dev *temp_dev;
int irq;
+ struct io_apic_irq_attr irq_attr;
- irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
+ irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+ PCI_SLOT(dev->devfn),
+ pin - 1, &irq_attr);
/*
* Busses behind bridges are typically not listed in the MP-table.
* In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
pin = pci_swizzle_interrupt_pin(dev, pin);
irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
- PCI_SLOT(bridge->devfn), pin - 1);
+ PCI_SLOT(bridge->devfn),
+ pin - 1, &irq_attr);
if (irq >= 0)
dev_warn(&dev->dev, "using bridge %s "
"INT %c to get IRQ %d\n",
@@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
+ io_apic_set_pci_routing(&dev->dev, irq,
+ &irq_attr);
+ dev->irq = irq;
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
- dev->irq = irq;
return 0;
} else
msg = "; probably buggy MP table";
+#endif
} else if (pci_probe & PCI_BIOS_IRQ_SCAN)
msg = "";
else
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab5..58bc00f68b1 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
}
}
+ current->mm->context.vdso = (void *)addr;
+
if (compat_uses_vma || !compat) {
/*
* MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto up_fail;
}
- current->mm->context.vdso = (void *)addr;
current_thread_info()->sysenter_return =
VDSO32_SYMBOL(addr, SYSENTER_RETURN);
up_fail:
+ if (ret)
+ current->mm->context.vdso = NULL;
+
up_write(&mm->mmap_sem);
return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098..21e1aeb9f3e 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/random.h>
+#include <linux/elf.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
@@ -115,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
goto up_fail;
}
+ current->mm->context.vdso = (void *)addr;
+
ret = install_special_mapping(mm, addr, vdso_size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
VM_ALWAYSDUMP,
vdso_pages);
- if (ret)
+ if (ret) {
+ current->mm->context.vdso = NULL;
goto up_fail;
+ }
- current->mm->context.vdso = (void *)addr;
up_fail:
up_write(&mm->mmap_sem);
return ret;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f09e8c36ee8..0a1700a2be9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
+#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
@@ -44,6 +45,7 @@
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
+#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
-void xen_leave_lazy(void)
+static void xen_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
xen_mc_flush();
+ paravirt_end_context_switch(next);
}
static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
+ unsigned long addr;
+
if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
return 0;
info->vector = vector;
- info->address = gate_offset(*val);
+
+ addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+ /*
+ * Look for known traps using IST, and substitute them
+ * appropriately. The debugger ones are the only ones we care
+ * about. Xen will handle faults like double_fault and
+ * machine_check, so we should never see them. Warn if
+ * there's an unexpected IST-using fault handler.
+ */
+ if (addr == (unsigned long)debug)
+ addr = (unsigned long)xen_debug;
+ else if (addr == (unsigned long)int3)
+ addr = (unsigned long)xen_int3;
+ else if (addr == (unsigned long)stack_segment)
+ addr = (unsigned long)xen_stack_segment;
+ else if (addr == (unsigned long)double_fault ||
+ addr == (unsigned long)nmi) {
+ /* Don't need to handle these */
+ return 0;
+#ifdef CONFIG_X86_MCE
+ } else if (addr == (unsigned long)machine_check) {
+ return 0;
+#endif
+ } else {
+ /* Some other trap using IST? */
+ if (WARN_ON(val->ist != 0))
+ return 0;
+ }
+#endif /* CONFIG_X86_64 */
+ info->address = addr;
+
info->cs = gate_segment(*val);
info->flags = val->dpl;
/* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
xen_mc_issue(PARAVIRT_LAZY_CPU);
}
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+static unsigned long xen_read_cr0(void)
+{
+ unsigned long cr0 = percpu_read(xen_cr0_value);
+
+ if (unlikely(cr0 == 0)) {
+ cr0 = native_read_cr0();
+ percpu_write(xen_cr0_value, cr0);
+ }
+
+ return cr0;
+}
+
static void xen_write_cr0(unsigned long cr0)
{
struct multicall_space mcs;
+ percpu_write(xen_cr0_value, cr0);
+
/* Only pay attention to cr0.TS; everything else is
ignored. */
mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.clts = xen_clts,
- .read_cr0 = native_read_cr0,
+ .read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
.read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
/* Xen takes care of %gs when switching to usermode for us */
.swapgs = paravirt_nop,
- .lazy_mode = {
- .enter = paravirt_enter_lazy_cpu,
- .leave = xen_leave_lazy,
- },
+ .start_context_switch = paravirt_start_context_switch,
+ .end_context_switch = xen_end_context_switch,
};
static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index fba55b1a402..4ceb2858165 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- /* updates to init_mm may be done without lock */
- if (mm == &init_mm)
- preempt_disable();
-
ADD_STATS(set_pte_at, 1);
// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
}
xen_set_pte(ptep, pteval);
-out:
- if (mm == &init_mm)
- preempt_enable();
+out: return;
}
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
/* If this cpu still has a stale cr3 reference, then make sure
it has been flushed. */
- if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+ if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
load_cr3(swapper_pg_dir);
- arch_flush_lazy_cpu_mode();
- }
}
static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
load_cr3(swapper_pg_dir);
else
leave_mm(smp_processor_id());
- arch_flush_lazy_cpu_mode();
}
/* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
xen_mark_init_mm_pinned();
}
+static void xen_leave_lazy_mmu(void)
+{
+ preempt_disable();
+ xen_mc_flush();
+ paravirt_leave_lazy_mmu();
+ preempt_enable();
+}
+
const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pagetable_setup_start = xen_pagetable_setup_start,
.pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
.lazy_mode = {
.enter = paravirt_enter_lazy_mmu,
- .leave = xen_leave_lazy,
+ .leave = xen_leave_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a..ad0047f47cd 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
*/
- e820_add_region(__pa(xen_start_info->mfn_list),
- xen_start_info->pt_base - xen_start_info->mfn_list,
- E820_RESERVED);
+ reserve_early(__pa(xen_start_info->mfn_list),
+ __pa(xen_start_info->pt_base),
+ "XEN START INFO");
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ca6596b05d5..22494fd4c9b 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_ident_map_ISA(void);
void xen_reserve_top(void);
-void xen_leave_lazy(void);
void xen_post_allocator_init(void);
char * __init xen_memory_setup(void);