summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-10-10 08:07:53 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2008-10-10 08:28:58 -0700
commitd403a6484f0341bf0624d17ece46f24f741b6a92 (patch)
treebe1c2ec69a3caa9f437e4b87ca9cac80e57fbc4d /arch/x86/kernel
parented458df4d2470adc02762a87a9ad665d0b1a2bd4 (diff)
parente496e3d645c93206faf61ff6005995ebd08cc39c (diff)
Merge phase #1 of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
This merges phase 1 of the x86 tree, which is a collection of branches: x86/alternatives, x86/cleanups, x86/commandline, x86/crashdump, x86/debug, x86/defconfig, x86/doc, x86/exports, x86/fpu, x86/gart, x86/idle, x86/mm, x86/mtrr, x86/nmi-watchdog, x86/oprofile, x86/paravirt, x86/reboot, x86/sparse-fixes, x86/tsc, x86/urgent and x86/vmalloc and as Ingo says: "these are the easiest, purely independent x86 topics with no conflicts, in one nice Octopus merge". * 'x86-v28-for-linus-phase1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (147 commits) x86: mtrr_cleanup: treat WRPROT as UNCACHEABLE x86: mtrr_cleanup: first 1M may be covered in var mtrrs x86: mtrr_cleanup: print out correct type v2 x86: trivial printk fix in efi.c x86, debug: mtrr_cleanup print out var mtrr before change it x86: mtrr_cleanup try gran_size to less than 1M, v3 x86: mtrr_cleanup try gran_size to less than 1M, cleanup x86: change MTRR_SANITIZER to def_bool y x86, debug printouts: IOMMU setup failures should not be KERN_ERR x86: export set_memory_ro and set_memory_rw x86: mtrr_cleanup try gran_size to less than 1M x86: mtrr_cleanup prepare to make gran_size to less 1M x86: mtrr_cleanup safe to get more spare regs now x86_64: be less annoying on boot, v2 x86: mtrr_cleanup hole size should be less than half of chunk_size, v2 x86: add mtrr_cleanup_debug command line x86: mtrr_cleanup optimization, v2 x86: don't need to go to chunksize to 4G x86_64: be less annoying on boot x86, olpc: fix endian bug in openfirmware workaround ...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/boot.c5
-rw-r--r--arch/x86/kernel/alternative.c8
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c2
-rw-r--r--arch/x86/kernel/bios_uv.c10
-rw-r--r--arch/x86/kernel/cpu/common_64.c51
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c274
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c86
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/crash_dump_64.c13
-rw-r--r--arch/x86/kernel/ds.c954
-rw-r--r--arch/x86/kernel/efi.c6
-rw-r--r--arch/x86/kernel/entry_64.S4
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/ioport.c1
-rw-r--r--arch/x86/kernel/ipi.c3
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/ldt.c1
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/pci-dma.c2
-rw-r--r--arch/x86/kernel/pci-gart_64.c27
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/process.c3
-rw-r--r--arch/x86/kernel/process_32.c62
-rw-r--r--arch/x86/kernel/process_64.c170
-rw-r--r--arch/x86/kernel/ptrace.c480
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/setup.c16
-rw-r--r--arch/x86/kernel/setup_percpu.c9
-rw-r--r--arch/x86/kernel/sigframe.h5
-rw-r--r--arch/x86/kernel/signal_32.c12
-rw-r--r--arch/x86/kernel/signal_64.c112
-rw-r--r--arch/x86/kernel/smpboot.c9
-rw-r--r--arch/x86/kernel/sys_i386_32.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c44
-rw-r--r--arch/x86/kernel/syscall_64.c4
-rw-r--r--arch/x86/kernel/time_32.c1
-rw-r--r--arch/x86/kernel/tls.c1
-rw-r--r--arch/x86/kernel/traps_64.c66
-rw-r--r--arch/x86/kernel/tsc.c290
-rw-r--r--arch/x86/kernel/visws_quirks.c16
-rw-r--r--arch/x86/kernel/vm86_32.c1
-rw-r--r--arch/x86/kernel/vmi_32.c10
52 files changed, 1889 insertions, 943 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c102af85df9..7d40ef7b36e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
#include <asm/proto.h>
-#include <asm/genapic.h>
#else /* X86 */
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#warning ACPI uses CMPXCHG, i486 and later hardware
#endif
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -160,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
struct acpi_mcfg_allocation *pci_mmcfg_config;
int pci_mmcfg_config_num;
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
{
if (!strcmp(mcfg->header.oem_id, "SGI"))
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65a0c1b4869..fb04e49776b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -231,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+ /* turn DS segment override prefix into lock prefix */
+ text_poke(*ptr, ((unsigned char []){0xf0}), 1);
};
}
static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
u8 **ptr;
- char insn[1];
if (noreplace_smp)
return;
- add_nops(insn, 1);
for (ptr = start; ptr < end; ptr++) {
if (*ptr < text)
continue;
if (*ptr > text_end)
continue;
- text_poke(*ptr, insn, 1);
+ /* turn lock prefix into DS segment override prefix */
+ text_poke(*ptr, ((unsigned char []){0x3E}), 1);
};
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db1..9a32b37ee2e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_ERR
+ printk(KERN_INFO
"Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_ERR
+ printk(KERN_INFO
"This costs you %d MB of RAM\n",
32 << fallback_aper_order);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 732d1f4e10e..5145a6e72bb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,7 +228,6 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
-#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index aa89387006f..505543a75a5 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -22,7 +22,7 @@
#define __NO_STUBS 1
#undef __SYSCALL
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
#define __SYSCALL(nr, sym) [nr] = 1,
static char syscalls[] = {
#include <asm/unistd.h>
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391..fdd585f9c53 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
{
const char *str;
switch (status) {
- case 0: str = "Call completed without error"; break;
- case -1: str = "Not implemented"; break;
- case -2: str = "Invalid argument"; break;
- case -3: str = "Call completed with error"; break;
- default: str = "Unknown BIOS status code"; break;
+ case 0: str = "Call completed without error"; break;
+ case -1: str = "Not implemented"; break;
+ case -2: str = "Invalid argument"; break;
+ case -3: str = "Call completed with error"; break;
+ default: str = "Unknown BIOS status code"; break;
}
return str;
}
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index a11f5d4477c..305b465889b 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -430,6 +430,49 @@ static __init int setup_noclflush(char *arg)
}
__setup("noclflush", setup_noclflush);
+struct msr_range {
+ unsigned min;
+ unsigned max;
+};
+
+static struct msr_range msr_range_array[] __cpuinitdata = {
+ { 0x00000000, 0x00000418},
+ { 0xc0000000, 0xc000040b},
+ { 0xc0010000, 0xc0010142},
+ { 0xc0011000, 0xc001103b},
+};
+
+static void __cpuinit print_cpu_msr(void)
+{
+ unsigned index;
+ u64 val;
+ int i;
+ unsigned index_min, index_max;
+
+ for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
+ index_min = msr_range_array[i].min;
+ index_max = msr_range_array[i].max;
+ for (index = index_min; index < index_max; index++) {
+ if (rdmsrl_amd_safe(index, &val))
+ continue;
+ printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
+ }
+ }
+}
+
+static int show_msr __cpuinitdata;
+static __init int setup_show_msr(char *arg)
+{
+ int num;
+
+ get_option(&arg, &num);
+
+ if (num > 0)
+ show_msr = num;
+ return 1;
+}
+__setup("show_msr=", setup_show_msr);
+
void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
if (c->x86_model_id[0])
@@ -439,6 +482,14 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT " stepping %02x\n", c->x86_mask);
else
printk(KERN_CONT "\n");
+
+#ifdef CONFIG_SMP
+ if (c->cpu_index < show_msr)
+ print_cpu_msr();
+#else
+ if (show_msr)
+ print_cpu_msr();
+#endif
}
static __init int setup_disablecpuid(char *arg)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b75f2569b8f..f113ef4595f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -222,10 +222,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
set_cpu_cap(c, X86_FEATURE_PEBS);
+ ds_init_intel(c);
}
if (cpu_has_bts)
- ds_init_intel(c);
+ ptrace_bts_init_intel(c);
/*
* See if we have a good local APIC by checking for buggy Pentia,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80e..4e8d77f01ee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
tmp |= ~((1<<(hi - 1)) - 1);
if (tmp != mask_lo) {
- static int once = 1;
-
- if (once) {
- printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
- once = 0;
- }
+ WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
mask_lo = tmp;
}
}
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb371..4c4214690dd 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
}
/* RED-PEN: base can be > 32bit */
len += seq_printf(seq,
- "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
+ "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
- mtrr_attrib_to_str(type), mtrr_usage_table[i]);
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
}
return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 885c8265e6b..c78c04821ea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
mtrr_type type;
};
-struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
static int __initdata debug_print;
static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
/* take out UC ranges */
for (i = 0; i < num_var_ranges; i++) {
type = range_state[i].type;
- if (type != MTRR_TYPE_UNCACHABLE)
+ if (type != MTRR_TYPE_UNCACHABLE &&
+ type != MTRR_TYPE_WRPROT)
continue;
size = range_state[i].size_pfn;
if (!size)
@@ -836,6 +837,13 @@ static int __init enable_mtrr_cleanup_setup(char *str)
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+ debug_print = 1;
+ return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
+
struct var_mtrr_state {
unsigned long range_startk;
unsigned long range_sizek;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
}
}
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+ char factor;
+ unsigned long base = sizek;
+
+ if (base & ((1<<10) - 1)) {
+ /* not MB alignment */
+ factor = 'K';
+ } else if (base & ((1<<20) - 1)){
+ factor = 'M';
+ base >>= 10;
+ } else {
+ factor = 'G';
+ base >>= 20;
+ }
+
+ *factorp = factor;
+
+ return base;
+}
+
static unsigned int __init
range_to_mtrr(unsigned int reg, unsigned long range_startk,
unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
align = max_align;
sizek = 1 << align;
- if (debug_print)
+ if (debug_print) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ start_base = to_size_factor(range_startk, &start_factor),
+ size_base = to_size_factor(sizek, &size_factor),
+
printk(KERN_DEBUG "Setting variable MTRR %d, "
- "base: %ldMB, range: %ldMB, type %s\n",
- reg, range_startk >> 10, sizek >> 10,
+ "base: %ld%cB, range: %ld%cB, type %s\n",
+ reg, start_base, start_factor,
+ size_base, size_factor,
(type == MTRR_TYPE_UNCACHABLE)?"UC":
((type == MTRR_TYPE_WRBACK)?"WB":"Other")
);
+ }
save_var_mtrr(reg++, range_startk, sizek, type);
range_startk += sizek;
range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
/* try to append some small hole */
range0_basek = state->range_startk;
range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+ /* no increase */
if (range0_sizek == state->range_sizek) {
if (debug_print)
printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
return 0;
}
- range0_sizek -= chunk_sizek;
- if (range0_sizek && sizek) {
- while (range0_basek + range0_sizek > (basek + sizek)) {
- range0_sizek -= chunk_sizek;
- if (!range0_sizek)
- break;
- }
+ /* only cut back, when it is not the last */
+ if (sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ if (range0_sizek >= chunk_sizek)
+ range0_sizek -= chunk_sizek;
+ else
+ range0_sizek = 0;
+
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+second_try:
+ range_basek = range0_basek + range0_sizek;
+
+ /* one hole in the middle */
+ if (range_basek > basek && range_basek <= (basek + sizek))
+ second_sizek = range_basek - basek;
+
+ if (range0_sizek > state->range_sizek) {
+
+ /* one hole in middle or at end */
+ hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+ /* hole size should be less than half of range0 size */
+ if (hole_sizek >= (range0_sizek >> 1) &&
+ range0_sizek >= chunk_sizek) {
+ range0_sizek -= chunk_sizek;
+ second_sizek = 0;
+ hole_sizek = 0;
+
+ goto second_try;
+ }
}
if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
(range0_basek + range0_sizek)<<10);
state->reg = range_to_mtrr(state->reg, range0_basek,
range0_sizek, MTRR_TYPE_WRBACK);
-
- }
-
- range_basek = range0_basek + range0_sizek;
- range_sizek = chunk_sizek;
-
- if (range_basek + range_sizek > basek &&
- range_basek + range_sizek <= (basek + sizek)) {
- /* one hole */
- second_basek = basek;
- second_sizek = range_basek + range_sizek - basek;
}
- /* if last piece, only could one hole near end */
- if ((second_basek || !basek) &&
- range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
- (chunk_sizek >> 1)) {
- /*
- * one hole in middle (second_sizek is 0) or at end
- * (second_sizek is 0 )
- */
- hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
- - second_sizek;
- hole_basek = range_basek + range_sizek - hole_sizek
- - second_sizek;
- } else {
- /* fallback for big hole, or several holes */
+ if (range0_sizek < state->range_sizek) {
+ /* need to handle left over */
range_sizek = state->range_sizek - range0_sizek;
- second_basek = 0;
- second_sizek = 0;
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n",
+ range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek,
+ range_sizek, MTRR_TYPE_WRBACK);
}
- if (debug_print)
- printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
- (range_basek + range_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
- MTRR_TYPE_WRBACK);
if (hole_sizek) {
+ hole_basek = range_basek - hole_sizek - second_sizek;
if (debug_print)
printk(KERN_DEBUG "hole: %016lx - %016lx\n",
- hole_basek<<10, (hole_basek + hole_sizek)<<10);
- state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
- MTRR_TYPE_UNCACHABLE);
-
+ hole_basek<<10,
+ (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek,
+ hole_sizek, MTRR_TYPE_UNCACHABLE);
}
return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
};
/*
- * gran_size: 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 4G
- * so we need (2+13)*6
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
*/
-#define NUM_RESULT 90
+#define NUM_RESULT 136
#define PSHIFT (PAGE_SHIFT - 10)
static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
static int __init mtrr_cleanup(unsigned address_bits)
{
unsigned long extra_remove_base, extra_remove_size;
- unsigned long i, base, size, def, dummy;
+ unsigned long base, size, def, dummy;
mtrr_type type;
int nr_range, nr_range_new;
u64 chunk_size, gran_size;
unsigned long range_sums, range_sums_new;
int index_good;
int num_reg_good;
+ int i;
/* extra one for all 0 */
int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
continue;
if (!size)
type = MTRR_NUM_TYPES;
+ if (type == MTRR_TYPE_WRPROT)
+ type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
num_var_ranges - num[MTRR_NUM_TYPES])
return 0;
+ /* print original var MTRRs at first, for debugging: */
+ printk(KERN_DEBUG "original variable MTRRs\n");
+ for (i = 0; i < num_var_ranges; i++) {
+ char start_factor = 'K', size_factor = 'K';
+ unsigned long start_base, size_base;
+
+ size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+ if (!size_base)
+ continue;
+
+ size_base = to_size_factor(size_base, &size_factor),
+ start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+ start_base = to_size_factor(start_base, &start_factor),
+ type = range_state[i].type;
+
+ printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+ i, start_base, start_factor,
+ size_base, size_factor,
+ (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+ ((type == MTRR_TYPE_WRPROT) ? "WP" :
+ ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+ );
+ }
+
memset(range, 0, sizeof(range));
extra_remove_size = 0;
- if (mtrr_tom2) {
- extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ if (mtrr_tom2)
extra_remove_size =
(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
- }
nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
extra_remove_size);
+ /*
+ * [0, 1M) should always be coverred by var mtrr with WB
+ * and fixed mtrrs should take effective before var mtrr for it
+ */
+ nr_range = add_range_with_merge(range, nr_range, 0,
+ (1ULL<<(20 - PAGE_SHIFT)) - 1);
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM coverred: %ldM\n",
range_sums >> (20 - PAGE_SHIFT));
if (mtrr_chunk_size && mtrr_gran_size) {
int num_reg;
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
- debug_print = 1;
+ debug_print++;
/* convert ranges to var ranges state */
num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
result[i].lose_cover_sizek =
(range_sums - range_sums_new) << PSHIFT;
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ lose_base, lose_factor);
if (!result[i].bad) {
set_var_mtrr_all(address_bits);
return 1;
}
printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
"will find optimal one\n");
- debug_print = 0;
+ debug_print--;
memset(result, 0, sizeof(result[0]));
}
i = 0;
memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
memset(result, 0, sizeof(result));
- for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
- for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+ for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+ char gran_factor;
+ unsigned long gran_base;
+
+ if (debug_print)
+ gran_base = to_size_factor(gran_size >> 10, &gran_factor);
+
+ for (chunk_size = gran_size; chunk_size < (1ULL<<32);
chunk_size <<= 1) {
int num_reg;
- if (debug_print)
- printk(KERN_INFO
- "\ngran_size: %lldM chunk_size_size: %lldM\n",
- gran_size >> 20, chunk_size >> 20);
+ if (debug_print) {
+ char chunk_factor;
+ unsigned long chunk_base;
+
+ chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
+ printk(KERN_INFO "\n");
+ printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ }
if (i >= NUM_RESULT)
continue;
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
/* print out all */
for (i = 0; i < NUM_RESULT; i++) {
- printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
- result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
- result[i].num_reg, result[i].bad?"-":"",
- result[i].lose_cover_sizek >> 10);
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+ result[i].bad?"*BAD*":" ",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
+ result[i].num_reg, result[i].bad?"-":"",
+ lose_base, lose_factor);
}
/* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
nr_mtrr_spare_reg = num_var_ranges - 1;
num_reg_good = -1;
for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
- if (!min_loss_pfn[i]) {
+ if (!min_loss_pfn[i])
num_reg_good = i;
- break;
- }
}
index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
}
if (index_good != -1) {
+ char gran_factor, chunk_factor, lose_factor;
+ unsigned long gran_base, chunk_base, lose_base;
+
printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
i = index_good;
- printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
- result[i].gran_sizek >> 10,
- result[i].chunk_sizek >> 10);
- printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
- result[i].num_reg,
- result[i].lose_cover_sizek >> 10);
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
+ gran_base, gran_factor, chunk_base, chunk_factor);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
+ result[i].num_reg, lose_base, lose_factor);
/* convert ranges to var ranges state */
chunk_size = result[i].chunk_sizek;
chunk_size <<= 10;
gran_size = result[i].gran_sizek;
gran_size <<= 10;
- debug_print = 1;
+ debug_print++;
x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ debug_print--;
set_var_mtrr_all(address_bits);
return 1;
}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4f..6bff382094f 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
/* setup the timer */
wrmsr(evntsel_msr, evntsel, 0);
write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
+ /* initialize the wd struct before enabling */
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
return 1;
}
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
#define P4_CCCR_ENABLE (1 << 12)
#define P4_CCCR_OVF (1 << 31)
+#define P4_CONTROLS 18
+static unsigned int p4_controls[18] = {
+ MSR_P4_BPU_CCCR0,
+ MSR_P4_BPU_CCCR1,
+ MSR_P4_BPU_CCCR2,
+ MSR_P4_BPU_CCCR3,
+ MSR_P4_MS_CCCR0,
+ MSR_P4_MS_CCCR1,
+ MSR_P4_MS_CCCR2,
+ MSR_P4_MS_CCCR3,
+ MSR_P4_FLAME_CCCR0,
+ MSR_P4_FLAME_CCCR1,
+ MSR_P4_FLAME_CCCR2,
+ MSR_P4_FLAME_CCCR3,
+ MSR_P4_IQ_CCCR0,
+ MSR_P4_IQ_CCCR1,
+ MSR_P4_IQ_CCCR2,
+ MSR_P4_IQ_CCCR3,
+ MSR_P4_IQ_CCCR4,
+ MSR_P4_IQ_CCCR5,
+};
/*
* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
* CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
evntsel_msr = MSR_P4_CRU_ESCR0;
cccr_msr = MSR_P4_IQ_CCCR0;
cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+
+ /*
+ * If we're on the kdump kernel or other situation, we may
+ * still have other performance counter registers set to
+ * interrupt and they'll keep interrupting forever because
+ * of the P4_CCCR_OVF quirk. So we need to ACK all the
+ * pending interrupts and disable all the registers here,
+ * before reenabling the NMI delivery. Refer to p4_rearm()
+ * about the P4_CCCR_OVF quirk.
+ */
+ if (reset_devices) {
+ unsigned int low, high;
+ int i;
+
+ for (i = 0; i < P4_CONTROLS; i++) {
+ rdmsr(p4_controls[i], low, high);
+ low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
+ wrmsr(p4_controls[i], low, high);
+ }
+ }
} else {
/* logical cpu 1 */
perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
wrmsr(cccr_msr, cccr_val, 0);
write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
+
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = cccr_msr;
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
return 1;
}
@@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
wrmsr(evntsel_msr, evntsel, 0);
nmi_hz = adjust_for_32bit_ctr(nmi_hz);
write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
wd->perfctr_msr = perfctr_msr;
wd->evntsel_msr = evntsel_msr;
wd->cccr_msr = 0; /* unused */
+
+ /* ok, everything is initialized, announce that we're set */
+ cpu_nmi_set_wd_enabled();
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
return 1;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8e9cd6a8ec1..6a44d646599 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
-#include <linux/smp_lock.h>
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a4..e90a60ef10c 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
#include <linux/errno.h>
#include <linux/crash_dump.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
/**
* copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ size_t csize, unsigned long offset, int userbuf)
{
void *vaddr;
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
return 0;
vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!vaddr)
+ return -ENOMEM;
if (userbuf) {
- if (copy_to_user(buf, (vaddr + offset), csize)) {
+ if (copy_to_user(buf, vaddr + offset, csize)) {
iounmap(vaddr);
return -EFAULT;
}
} else
- memcpy(buf, (vaddr + offset), csize);
+ memcpy(buf, vaddr + offset, csize);
iounmap(vaddr);
return csize;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48..2b69994fd3a 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
* Debug Store support
*
* This provides a low-level interface to the hardware's Debug Store
- * feature that is used for last branch recording (LBR) and
+ * feature that is used for branch trace store (BTS) and
* precise-event based sampling (PEBS).
*
- * Different architectures use a different DS layout/pointer size.
- * The below functions therefore work on a void*.
+ * It manages:
+ * - per-thread and per-cpu allocation of BTS and PEBS
+ * - buffer memory allocation (optional)
+ * - buffer overflow handling
+ * - buffer access
*
+ * It assumes:
+ * - get_task_struct on all parameter tasks
+ * - current is allowed to trace parameter tasks
*
- * Since there is no user for PEBS, yet, only LBR (or branch
- * trace store, BTS) is supported.
*
- *
- * Copyright (C) 2007 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
+ * Copyright (C) 2007-2008 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
*/
+
+#ifdef CONFIG_X86_DS
+
#include <asm/ds.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+
+/*
+ * The configuration for a particular DS hardware implementation.
+ */
+struct ds_configuration {
+ /* the size of the DS structure in bytes */
+ unsigned char sizeof_ds;
+ /* the size of one pointer-typed field in the DS structure in bytes;
+ this covers the first 8 fields related to buffer management. */
+ unsigned char sizeof_field;
+ /* the size of a BTS/PEBS record in bytes */
+ unsigned char sizeof_rec[2];
+};
+static struct ds_configuration ds_cfg;
/*
@@ -44,378 +67,747 @@
* (interrupt occurs when write pointer passes interrupt pointer)
* - value to which counter is reset following counter overflow
*
- * On later architectures, the last branch recording hardware uses
- * 64bit pointers even in 32bit mode.
- *
- *
- * Branch Trace Store (BTS) records store information about control
- * flow changes. They at least provide the following information:
- * - source linear address
- * - destination linear address
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
*
- * Netburst supported a predicated bit that had been dropped in later
- * architectures. We do not suppor it.
*
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ * - an offset giving the start of the respective region
*
- * In order to abstract from the actual DS and BTS layout, we describe
- * the access to the relevant fields.
- * Thanks to Andi Kleen for proposing this design.
+ * This offset is further used to index various arrays holding
+ * information for BTS and PEBS at the respective index.
*
- * The implementation, however, is not as general as it might seem. In
- * order to stay somewhat simple and efficient, we assume an
- * underlying unsigned type (mostly a pointer type) and we expect the
- * field to be at least as big as that type.
+ * On later 32bit processors, we only access the lower 32bit of the
+ * 64bit pointer fields. The upper halves will be zeroed out.
*/
-/*
- * A special from_ip address to indicate that the BTS record is an
- * info record that needs to be interpreted or skipped.
- */
-#define BTS_ESCAPE_ADDRESS (-1)
+enum ds_field {
+ ds_buffer_base = 0,
+ ds_index,
+ ds_absolute_maximum,
+ ds_interrupt_threshold,
+};
-/*
- * A field access descriptor
- */
-struct access_desc {
- unsigned char offset;
- unsigned char size;
+enum ds_qualifier {
+ ds_bts = 0,
+ ds_pebs
};
+static inline unsigned long ds_get(const unsigned char *base,
+ enum ds_qualifier qual, enum ds_field field)
+{
+ base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ return *(unsigned long *)base;
+}
+
+static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
+ enum ds_field field, unsigned long value)
+{
+ base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+ (*(unsigned long *)base) = value;
+}
+
+
/*
- * The configuration for a particular DS/BTS hardware implementation.
+ * Locking is done only for allocating BTS or PEBS resources and for
+ * guarding context and buffer memory allocation.
+ *
+ * Most functions require the current task to own the ds context part
+ * they are going to access. All the locking is done when validating
+ * access to the context.
*/
-struct ds_configuration {
- /* the DS configuration */
- unsigned char sizeof_ds;
- struct access_desc bts_buffer_base;
- struct access_desc bts_index;
- struct access_desc bts_absolute_maximum;
- struct access_desc bts_interrupt_threshold;
- /* the BTS configuration */
- unsigned char sizeof_bts;
- struct access_desc from_ip;
- struct access_desc to_ip;
- /* BTS variants used to store additional information like
- timestamps */
- struct access_desc info_type;
- struct access_desc info_data;
- unsigned long debugctl_mask;
-};
+static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
/*
- * The global configuration used by the below accessor functions
+ * Validate that the current task is allowed to access the BTS/PEBS
+ * buffer of the parameter task.
+ *
+ * Returns 0, if access is granted; -Eerrno, otherwise.
*/
-static struct ds_configuration ds_cfg;
+static inline int ds_validate_access(struct ds_context *context,
+ enum ds_qualifier qual)
+{
+ if (!context)
+ return -EPERM;
+
+ if (context->owner[qual] == current)
+ return 0;
+
+ return -EPERM;
+}
+
/*
- * Accessor functions for some DS and BTS fields using the above
- * global ptrace_bts_cfg.
+ * We either support (system-wide) per-cpu or per-thread allocation.
+ * We distinguish the two based on the task_struct pointer, where a
+ * NULL pointer indicates per-cpu allocation for the current cpu.
+ *
+ * Allocations are use-counted. As soon as resources are allocated,
+ * further allocations must be of the same type (per-cpu or
+ * per-thread). We model this by counting allocations (i.e. the number
+ * of tracers of a certain type) for one type negatively:
+ * =0 no tracers
+ * >0 number of per-thread tracers
+ * <0 number of per-cpu tracers
+ *
+ * The below functions to get and put tracers and to check the
+ * allocation type require the ds_lock to be held by the caller.
+ *
+ * Tracers essentially gives the number of ds contexts for a certain
+ * type of allocation.
*/
-static inline unsigned long get_bts_buffer_base(char *base)
+static long tracers;
+
+static inline void get_tracer(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
+ tracers += (task ? 1 : -1);
}
-static inline void set_bts_buffer_base(char *base, unsigned long value)
+
+static inline void put_tracer(struct task_struct *task)
{
- (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
+ tracers -= (task ? 1 : -1);
}
-static inline unsigned long get_bts_index(char *base)
+
+static inline int check_tracer(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_index.offset);
+ return (task ? (tracers >= 0) : (tracers <= 0));
}
-static inline void set_bts_index(char *base, unsigned long value)
+
+
+/*
+ * The DS context is either attached to a thread or to a cpu:
+ * - in the former case, the thread_struct contains a pointer to the
+ * attached context.
+ * - in the latter case, we use a static array of per-cpu context
+ * pointers.
+ *
+ * Contexts are use-counted. They are allocated on first access and
+ * deallocated when the last user puts the context.
+ *
+ * We distinguish between an allocating and a non-allocating get of a
+ * context:
+ * - the allocating get is used for requesting BTS/PEBS resources. It
+ * requires the caller to hold the global ds_lock.
+ * - the non-allocating get is used for all other cases. A
+ * non-existing context indicates an error. It acquires and releases
+ * the ds_lock itself for obtaining the context.
+ *
+ * A context and its DS configuration are allocated and deallocated
+ * together. A context always has a DS configuration of the
+ * appropriate size.
+ */
+static DEFINE_PER_CPU(struct ds_context *, system_context);
+
+#define this_system_context per_cpu(system_context, smp_processor_id())
+
+/*
+ * Returns the pointer to the parameter task's context or to the
+ * system-wide context, if task is NULL.
+ *
+ * Increases the use count of the returned context, if not NULL.
+ */
+static inline struct ds_context *ds_get_context(struct task_struct *task)
{
- (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
+ struct ds_context *context;
+
+ spin_lock(&ds_lock);
+
+ context = (task ? task->thread.ds_ctx : this_system_context);
+ if (context)
+ context->count++;
+
+ spin_unlock(&ds_lock);
+
+ return context;
}
-static inline unsigned long get_bts_absolute_maximum(char *base)
+
+/*
+ * Same as ds_get_context, but allocates the context and it's DS
+ * structure, if necessary; returns NULL; if out of memory.
+ *
+ * pre: requires ds_lock to be held
+ */
+static inline struct ds_context *ds_alloc_context(struct task_struct *task)
{
- return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
+ struct ds_context **p_context =
+ (task ? &task->thread.ds_ctx : &this_system_context);
+ struct ds_context *context = *p_context;
+
+ if (!context) {
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+
+ if (!context)
+ return NULL;
+
+ context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+ if (!context->ds) {
+ kfree(context);
+ return NULL;
+ }
+
+ *p_context = context;
+
+ context->this = p_context;
+ context->task = task;
+
+ if (task)
+ set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
+
+ if (!task || (task == current))
+ wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
+
+ get_tracer(task);
+ }
+
+ context->count++;
+
+ return context;
}
-static inline void set_bts_absolute_maximum(char *base, unsigned long value)
+
+/*
+ * Decreases the use count of the parameter context, if not NULL.
+ * Deallocates the context, if the use count reaches zero.
+ */
+static inline void ds_put_context(struct ds_context *context)
{
- (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
+ if (!context)
+ return;
+
+ spin_lock(&ds_lock);
+
+ if (--context->count)
+ goto out;
+
+ *(context->this) = NULL;
+
+ if (context->task)
+ clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+
+ if (!context->task || (context->task == current))
+ wrmsrl(MSR_IA32_DS_AREA, 0);
+
+ put_tracer(context->task);
+
+ /* free any leftover buffers from tracers that did not
+ * deallocate them properly. */
+ kfree(context->buffer[ds_bts]);
+ kfree(context->buffer[ds_pebs]);
+ kfree(context->ds);
+ kfree(context);
+ out:
+ spin_unlock(&ds_lock);
}
-static inline unsigned long get_bts_interrupt_threshold(char *base)
+
+
+/*
+ * Handle a buffer overflow
+ *
+ * task: the task whose buffers are overflowing;
+ * NULL for a buffer overflow on the current cpu
+ * context: the ds context
+ * qual: the buffer type
+ */
+static void ds_overflow(struct task_struct *task, struct ds_context *context,
+ enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
+ if (!context)
+ return;
+
+ if (context->callback[qual])
+ (*context->callback[qual])(task);
+
+ /* todo: do some more overflow handling */
}
-static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
+
+
+/*
+ * Allocate a non-pageable buffer of the parameter size.
+ * Checks the memory and the locked memory rlimit.
+ *
+ * Returns the buffer, if successful;
+ * NULL, if out of memory or rlimit exceeded.
+ *
+ * size: the requested buffer size in bytes
+ * pages (out): if not NULL, contains the number of pages reserved
+ */
+static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
{
- (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
+ unsigned long rlim, vm, pgsz;
+ void *buffer;
+
+ pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->total_vm + pgsz;
+ if (rlim < vm)
+ return NULL;
+
+ rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+ vm = current->mm->locked_vm + pgsz;
+ if (rlim < vm)
+ return NULL;
+
+ buffer = kzalloc(size, GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ current->mm->total_vm += pgsz;
+ current->mm->locked_vm += pgsz;
+
+ if (pages)
+ *pages = pgsz;
+
+ return buffer;
}
-static inline unsigned long get_from_ip(char *base)
+
+static int ds_request(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.from_ip.offset);
+ struct ds_context *context;
+ unsigned long buffer, adj;
+ const unsigned long alignment = (1 << 3);
+ int error = 0;
+
+ if (!ds_cfg.sizeof_ds)
+ return -EOPNOTSUPP;
+
+ /* we require some space to do alignment adjustments below */
+ if (size < (alignment + ds_cfg.sizeof_rec[qual]))
+ return -EINVAL;
+
+ /* buffer overflow notification is not yet implemented */
+ if (ovfl)
+ return -EOPNOTSUPP;
+
+
+ spin_lock(&ds_lock);
+
+ if (!check_tracer(task))
+ return -EPERM;
+
+ error = -ENOMEM;
+ context = ds_alloc_context(task);
+ if (!context)
+ goto out_unlock;
+
+ error = -EALREADY;
+ if (context->owner[qual] == current)
+ goto out_unlock;
+ error = -EPERM;
+ if (context->owner[qual] != NULL)
+ goto out_unlock;
+ context->owner[qual] = current;
+
+ spin_unlock(&ds_lock);
+
+
+ error = -ENOMEM;
+ if (!base) {
+ base = ds_allocate_buffer(size, &context->pages[qual]);
+ if (!base)
+ goto out_release;
+
+ context->buffer[qual] = base;
+ }
+ error = 0;
+
+ context->callback[qual] = ovfl;
+
+ /* adjust the buffer address and size to meet alignment
+ * constraints:
+ * - buffer is double-word aligned
+ * - size is multiple of record size
+ *
+ * We checked the size at the very beginning; we have enough
+ * space to do the adjustment.
+ */
+ buffer = (unsigned long)base;
+
+ adj = ALIGN(buffer, alignment) - buffer;
+ buffer += adj;
+ size -= adj;
+
+ size /= ds_cfg.sizeof_rec[qual];
+ size *= ds_cfg.sizeof_rec[qual];
+
+ ds_set(context->ds, qual, ds_buffer_base, buffer);
+ ds_set(context->ds, qual, ds_index, buffer);
+ ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+
+ if (ovfl) {
+ /* todo: select a suitable interrupt threshold */
+ } else
+ ds_set(context->ds, qual,
+ ds_interrupt_threshold, buffer + size + 1);
+
+ /* we keep the context until ds_release */
+ return error;
+
+ out_release:
+ context->owner[qual] = NULL;
+ ds_put_context(context);
+ return error;
+
+ out_unlock:
+ spin_unlock(&ds_lock);
+ ds_put_context(context);
+ return error;
}
-static inline void set_from_ip(char *base, unsigned long value)
+
+int ds_request_bts(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl)
{
- (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
+ return ds_request(task, base, size, ovfl, ds_bts);
}
-static inline unsigned long get_to_ip(char *base)
+
+int ds_request_pebs(struct task_struct *task, void *base, size_t size,
+ ds_ovfl_callback_t ovfl)
{
- return *(unsigned long *)(base + ds_cfg.to_ip.offset);
+ return ds_request(task, base, size, ovfl, ds_pebs);
}
-static inline void set_to_ip(char *base, unsigned long value)
+
+static int ds_release(struct task_struct *task, enum ds_qualifier qual)
{
- (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
+ struct ds_context *context;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ kfree(context->buffer[qual]);
+ context->buffer[qual] = NULL;
+
+ current->mm->total_vm -= context->pages[qual];
+ current->mm->locked_vm -= context->pages[qual];
+ context->pages[qual] = 0;
+ context->owner[qual] = NULL;
+
+ /*
+ * we put the context twice:
+ * once for the ds_get_context
+ * once for the corresponding ds_request
+ */
+ ds_put_context(context);
+ out:
+ ds_put_context(context);
+ return error;
}
-static inline unsigned char get_info_type(char *base)
+
+int ds_release_bts(struct task_struct *task)
{
- return *(unsigned char *)(base + ds_cfg.info_type.offset);
+ return ds_release(task, ds_bts);
}
-static inline void set_info_type(char *base, unsigned char value)
+
+int ds_release_pebs(struct task_struct *task)
{
- (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
+ return ds_release(task, ds_pebs);
}
-static inline unsigned long get_info_data(char *base)
+
+static int ds_get_index(struct task_struct *task, size_t *pos,
+ enum ds_qualifier qual)
{
- return *(unsigned long *)(base + ds_cfg.info_data.offset);
+ struct ds_context *context;
+ unsigned long base, index;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+
+ error = ((index - base) / ds_cfg.sizeof_rec[qual]);
+ if (pos)
+ *pos = error;
+ out:
+ ds_put_context(context);
+ return error;
}
-static inline void set_info_data(char *base, unsigned long value)
+
+int ds_get_bts_index(struct task_struct *task, size_t *pos)
{
- (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
+ return ds_get_index(task, pos, ds_bts);
}
+int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+{
+ return ds_get_index(task, pos, ds_pebs);
+}
-int ds_allocate(void **dsp, size_t bts_size_in_bytes)
+static int ds_get_end(struct task_struct *task, size_t *pos,
+ enum ds_qualifier qual)
{
- size_t bts_size_in_records;
- unsigned long bts;
- void *ds;
+ struct ds_context *context;
+ unsigned long base, end;
+ int error;
+
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+
+ error = ((end - base) / ds_cfg.sizeof_rec[qual]);
+ if (pos)
+ *pos = error;
+ out:
+ ds_put_context(context);
+ return error;
+}
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+int ds_get_bts_end(struct task_struct *task, size_t *pos)
+{
+ return ds_get_end(task, pos, ds_bts);
+}
- if (bts_size_in_bytes < 0)
- return -EINVAL;
+int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+{
+ return ds_get_end(task, pos, ds_pebs);
+}
- bts_size_in_records =
- bts_size_in_bytes / ds_cfg.sizeof_bts;
- bts_size_in_bytes =
- bts_size_in_records * ds_cfg.sizeof_bts;
+static int ds_access(struct task_struct *task, size_t index,
+ const void **record, enum ds_qualifier qual)
+{
+ struct ds_context *context;
+ unsigned long base, idx;
+ int error;
- if (bts_size_in_bytes <= 0)
+ if (!record)
return -EINVAL;
- bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
-
- if (!bts)
- return -ENOMEM;
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
- ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ idx = base + (index * ds_cfg.sizeof_rec[qual]);
- if (!ds) {
- kfree((void *)bts);
- return -ENOMEM;
- }
-
- set_bts_buffer_base(ds, bts);
- set_bts_index(ds, bts);
- set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
- set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
+ error = -EINVAL;
+ if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
+ goto out;
- *dsp = ds;
- return 0;
+ *record = (const void *)idx;
+ error = ds_cfg.sizeof_rec[qual];
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_free(void **dsp)
+int ds_access_bts(struct task_struct *task, size_t index, const void **record)
{
- if (*dsp) {
- kfree((void *)get_bts_buffer_base(*dsp));
- kfree(*dsp);
- *dsp = NULL;
- }
- return 0;
+ return ds_access(task, index, record, ds_bts);
}
-int ds_get_bts_size(void *ds)
+int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
{
- int size_in_bytes;
-
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
-
- if (!ds)
- return 0;
-
- size_in_bytes =
- get_bts_absolute_maximum(ds) -
- get_bts_buffer_base(ds);
- return size_in_bytes;
+ return ds_access(task, index, record, ds_pebs);
}
-int ds_get_bts_end(void *ds)
+static int ds_write(struct task_struct *task, const void *record, size_t size,
+ enum ds_qualifier qual, int force)
{
- int size_in_bytes = ds_get_bts_size(ds);
-
- if (size_in_bytes <= 0)
- return size_in_bytes;
+ struct ds_context *context;
+ int error;
- return size_in_bytes / ds_cfg.sizeof_bts;
-}
+ if (!record)
+ return -EINVAL;
-int ds_get_bts_index(void *ds)
-{
- int index_offset_in_bytes;
+ error = -EPERM;
+ context = ds_get_context(task);
+ if (!context)
+ goto out;
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+ if (!force) {
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
+ }
- index_offset_in_bytes =
- get_bts_index(ds) -
- get_bts_buffer_base(ds);
+ error = 0;
+ while (size) {
+ unsigned long base, index, end, write_end, int_th;
+ unsigned long write_size, adj_write_size;
+
+ /*
+ * write as much as possible without producing an
+ * overflow interrupt.
+ *
+ * interrupt_threshold must either be
+ * - bigger than absolute_maximum or
+ * - point to a record between buffer_base and absolute_maximum
+ *
+ * index points to a valid record.
+ */
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ index = ds_get(context->ds, qual, ds_index);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
+ int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+
+ write_end = min(end, int_th);
+
+ /* if we are already beyond the interrupt threshold,
+ * we fill the entire buffer */
+ if (write_end <= index)
+ write_end = end;
+
+ if (write_end <= index)
+ goto out;
+
+ write_size = min((unsigned long) size, write_end - index);
+ memcpy((void *)index, record, write_size);
+
+ record = (const char *)record + write_size;
+ size -= write_size;
+ error += write_size;
+
+ adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+ adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+ /* zero out trailing bytes */
+ memset((char *)index + write_size, 0,
+ adj_write_size - write_size);
+ index += adj_write_size;
+
+ if (index >= end)
+ index = base;
+ ds_set(context->ds, qual, ds_index, index);
+
+ if (index >= int_th)
+ ds_overflow(task, context, qual);
+ }
- return index_offset_in_bytes / ds_cfg.sizeof_bts;
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_set_overflow(void *ds, int method)
+int ds_write_bts(struct task_struct *task, const void *record, size_t size)
{
- switch (method) {
- case DS_O_SIGNAL:
- return -EOPNOTSUPP;
- case DS_O_WRAP:
- return 0;
- default:
- return -EINVAL;
- }
+ return ds_write(task, record, size, ds_bts, /* force = */ 0);
}
-int ds_get_overflow(void *ds)
+int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
{
- return DS_O_WRAP;
+ return ds_write(task, record, size, ds_pebs, /* force = */ 0);
}
-int ds_clear(void *ds)
+int ds_unchecked_write_bts(struct task_struct *task,
+ const void *record, size_t size)
{
- int bts_size = ds_get_bts_size(ds);
- unsigned long bts_base;
-
- if (bts_size <= 0)
- return bts_size;
-
- bts_base = get_bts_buffer_base(ds);
- memset((void *)bts_base, 0, bts_size);
-
- set_bts_index(ds, bts_base);
- return 0;
+ return ds_write(task, record, size, ds_bts, /* force = */ 1);
}
-int ds_read_bts(void *ds, int index, struct bts_struct *out)
+int ds_unchecked_write_pebs(struct task_struct *task,
+ const void *record, size_t size)
{
- void *bts;
+ return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+}
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
+static int ds_reset_or_clear(struct task_struct *task,
+ enum ds_qualifier qual, int clear)
+{
+ struct ds_context *context;
+ unsigned long base, end;
+ int error;
- if (index < 0)
- return -EINVAL;
+ context = ds_get_context(task);
+ error = ds_validate_access(context, qual);
+ if (error < 0)
+ goto out;
- if (index >= ds_get_bts_size(ds))
- return -EINVAL;
+ base = ds_get(context->ds, qual, ds_buffer_base);
+ end = ds_get(context->ds, qual, ds_absolute_maximum);
- bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
+ if (clear)
+ memset((void *)base, 0, end - base);
- memset(out, 0, sizeof(*out));
- if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
- out->qualifier = get_info_type(bts);
- out->variant.jiffies = get_info_data(bts);
- } else {
- out->qualifier = BTS_BRANCH;
- out->variant.lbr.from_ip = get_from_ip(bts);
- out->variant.lbr.to_ip = get_to_ip(bts);
- }
+ ds_set(context->ds, qual, ds_index, base);
- return sizeof(*out);;
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
}
-int ds_write_bts(void *ds, const struct bts_struct *in)
+int ds_reset_bts(struct task_struct *task)
{
- unsigned long bts;
-
- if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
- return -EOPNOTSUPP;
-
- if (ds_get_bts_size(ds) <= 0)
- return -ENXIO;
+ return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+}
- bts = get_bts_index(ds);
+int ds_reset_pebs(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+}
- memset((void *)bts, 0, ds_cfg.sizeof_bts);
- switch (in->qualifier) {
- case BTS_INVALID:
- break;
+int ds_clear_bts(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+}
- case BTS_BRANCH:
- set_from_ip((void *)bts, in->variant.lbr.from_ip);
- set_to_ip((void *)bts, in->variant.lbr.to_ip);
- break;
+int ds_clear_pebs(struct task_struct *task)
+{
+ return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+}
- case BTS_TASK_ARRIVES:
- case BTS_TASK_DEPARTS:
- set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
- set_info_type((void *)bts, in->qualifier);
- set_info_data((void *)bts, in->variant.jiffies);
- break;
+int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+{
+ struct ds_context *context;
+ int error;
- default:
+ if (!value)
return -EINVAL;
- }
- bts = bts + ds_cfg.sizeof_bts;
- if (bts >= get_bts_absolute_maximum(ds))
- bts = get_bts_buffer_base(ds);
- set_bts_index(ds, bts);
+ context = ds_get_context(task);
+ error = ds_validate_access(context, ds_pebs);
+ if (error < 0)
+ goto out;
- return ds_cfg.sizeof_bts;
+ *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
}
-unsigned long ds_debugctl_mask(void)
+int ds_set_pebs_reset(struct task_struct *task, u64 value)
{
- return ds_cfg.debugctl_mask;
-}
+ struct ds_context *context;
+ int error;
-#ifdef __i386__
-static const struct ds_configuration ds_cfg_netburst = {
- .sizeof_ds = 9 * 4,
- .bts_buffer_base = { 0, 4 },
- .bts_index = { 4, 4 },
- .bts_absolute_maximum = { 8, 4 },
- .bts_interrupt_threshold = { 12, 4 },
- .sizeof_bts = 3 * 4,
- .from_ip = { 0, 4 },
- .to_ip = { 4, 4 },
- .info_type = { 4, 1 },
- .info_data = { 8, 4 },
- .debugctl_mask = (1<<2)|(1<<3)
-};
+ context = ds_get_context(task);
+ error = ds_validate_access(context, ds_pebs);
+ if (error < 0)
+ goto out;
-static const struct ds_configuration ds_cfg_pentium_m = {
- .sizeof_ds = 9 * 4,
- .bts_buffer_base = { 0, 4 },
- .bts_index = { 4, 4 },
- .bts_absolute_maximum = { 8, 4 },
- .bts_interrupt_threshold = { 12, 4 },
- .sizeof_bts = 3 * 4,
- .from_ip = { 0, 4 },
- .to_ip = { 4, 4 },
- .info_type = { 4, 1 },
- .info_data = { 8, 4 },
- .debugctl_mask = (1<<6)|(1<<7)
+ *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+
+ error = 0;
+ out:
+ ds_put_context(context);
+ return error;
+}
+
+static const struct ds_configuration ds_cfg_var = {
+ .sizeof_ds = sizeof(long) * 12,
+ .sizeof_field = sizeof(long),
+ .sizeof_rec[ds_bts] = sizeof(long) * 3,
+ .sizeof_rec[ds_pebs] = sizeof(long) * 10
};
-#endif /* _i386_ */
-
-static const struct ds_configuration ds_cfg_core2 = {
- .sizeof_ds = 9 * 8,
- .bts_buffer_base = { 0, 8 },
- .bts_index = { 8, 8 },
- .bts_absolute_maximum = { 16, 8 },
- .bts_interrupt_threshold = { 24, 8 },
- .sizeof_bts = 3 * 8,
- .from_ip = { 0, 8 },
- .to_ip = { 8, 8 },
- .info_type = { 8, 1 },
- .info_data = { 16, 8 },
- .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+static const struct ds_configuration ds_cfg_64 = {
+ .sizeof_ds = 8 * 12,
+ .sizeof_field = 8,
+ .sizeof_rec[ds_bts] = 8 * 3,
+ .sizeof_rec[ds_pebs] = 8 * 10
};
static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
switch (c->x86) {
case 0x6:
switch (c->x86_model) {
-#ifdef __i386__
case 0xD:
case 0xE: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m);
+ ds_configure(&ds_cfg_var);
break;
-#endif /* _i386_ */
case 0xF: /* Core2 */
- ds_configure(&ds_cfg_core2);
+ case 0x1C: /* Atom */
+ ds_configure(&ds_cfg_64);
break;
default:
/* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
case 0xF:
switch (c->x86_model) {
-#ifdef __i386__
case 0x0:
case 0x1:
case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst);
+ ds_configure(&ds_cfg_var);
break;
-#endif /* _i386_ */
default:
/* sorry, don't know about them */
break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
break;
}
}
+
+void ds_free(struct ds_context *context)
+{
+ /* This is called when the task owning the parameter context
+ * is dying. There should not be any user of that context left
+ * to disturb us, anymore. */
+ unsigned long leftovers = context->count;
+ while (leftovers--)
+ ds_put_context(context);
+}
+#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b..945a31cdd81 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
if (memmap.map == NULL)
printk(KERN_ERR "Could not map the EFI memory map!\n");
memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
if (memmap.desc_size != sizeof(efi_memory_desc_t))
- printk(KERN_WARNING "Kernel-defined memdesc"
- "doesn't match the one from EFI!\n");
+ printk(KERN_WARNING
+ "Kernel-defined memdesc doesn't match the one from EFI!\n");
+
if (add_efi_memmap)
do_add_efi_memmap();
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 89434d43960..cf3a0b2d005 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64)
ENTRY(ret_from_fork)
CFI_DEFAULT_STACK
push kernel_eflags(%rip)
- CFI_ADJUST_CFA_OFFSET 4
+ CFI_ADJUST_CFA_OFFSET 8
popf # reset kernel eflags
- CFI_ADJUST_CFA_OFFSET -4
+ CFI_ADJUST_CFA_OFFSET -8
call schedule_tail
GET_THREAD_INFO(%rcx)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9bfc4d72fb2..d16084f9064 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
}
load_idt((const struct desc_ptr *)&idt_descr);
- early_printk("Kernel alive\n");
+ if (console_loglevel == 10)
+ early_printk("Kernel alive\n");
x86_64_init_pda();
- early_printk("Kernel really alive\n");
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c8..19191430274 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/thread_info.h>
#include <linux/syscalls.h>
+#include <asm/syscalls.h>
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 3f7537b669d..f1c688e46f3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
#ifdef CONFIG_X86_32
#include <mach_apic.h>
+#include <mach_ipi.h>
+
/*
* the following functions deal with sending IPIs between CPUs.
*
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
}
/* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
static int convert_apicid_to_cpu(int apic_id)
{
int i;
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc08..b71e02d42f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ skip:
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_call_count);
- seq_printf(p, " function call interrupts\n");
+ seq_printf(p, " Function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d..f065fe9071b 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ skip:
seq_printf(p, "CAL: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
- seq_printf(p, " function call interrupts\n");
+ seq_printf(p, " Function call interrupts\n");
seq_printf(p, "TLB: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2..478bca986ec 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
}
-static void kvm_release_pt(u32 pfn)
+static void kvm_release_pt(unsigned long pfn)
{
struct kvm_mmu_op_release_pt rpt = {
.header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b68e21f06f4..0ed5f939b90 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
#include <asm/ldt.h>
#include <asm/desc.h>
#include <asm/mmu_context.h>
+#include <asm/syscalls.h>
#ifdef CONFIG_SMP
static void flush_ldt(void *current_mm)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2cc4a..2c97f07f1c2 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
on_each_cpu(__acpi_nmi_disable, NULL, 1);
}
+/*
+ * This function is called as soon the LAPIC NMI watchdog driver has everything
+ * in place and it's ready to check if the NMIs belong to the NMI watchdog
+ */
+void cpu_nmi_set_wd_enabled(void)
+{
+ __get_cpu_var(wd_enabled) = 1;
+}
+
void setup_apic_nmi_watchdog(void *unused)
{
if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
switch (nmi_watchdog) {
case NMI_LOCAL_APIC:
- /* enable it before to avoid race with handler */
- __get_cpu_var(wd_enabled) = 1;
if (lapic_watchdog_init(nmi_hz) < 0) {
__get_cpu_var(wd_enabled) = 0;
return;
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e667227480..7a13fac63a1 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
static void __init platform_detect(void)
{
size_t propsize;
- u32 rev;
+ __be32 rev;
if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
&propsize) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
- rev = 0;
+ rev = cpu_to_be32(0);
}
olpc_platform_info.boardrev = be32_to_cpu(rev);
}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
static void __init platform_detect(void)
{
/* stopgap until OFW support is added to the kernel */
- olpc_platform_info.boardrev = be32_to_cpu(0xc2);
+ olpc_platform_info.boardrev = 0xc2;
}
#endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 300da17e61c..e2f43768723 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -330,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = {
#endif
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
+ .read_msr_amd = native_read_msr_amd_safe,
.write_msr = native_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781..9fe644f4861 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
start = start_##ops##_##x; \
end = end_##ops##_##x; \
goto patch_site
- switch(type) {
+ switch (type) {
PATCH_SITE(pv_irq_ops, irq_disable);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 87d4d6964ec..f704cb51ff8 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -82,7 +82,7 @@ void __init dma32_reserve_bootmem(void)
* using 512M as goal
*/
align = 64ULL<<20;
- size = round_up(dma32_bootmem_size, align);
+ size = roundup(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
512ULL<<20);
if (dma32_bootmem_ptr)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index be33a5442d8..1a895a58253 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -82,7 +82,8 @@ AGPEXTERN __u32 *agp_gatt_table;
static unsigned long next_bit; /* protected by iommu_bitmap_lock */
static int need_flush; /* global flush state. set for each gart wrap */
-static unsigned long alloc_iommu(struct device *dev, int size)
+static unsigned long alloc_iommu(struct device *dev, int size,
+ unsigned long align_mask)
{
unsigned long offset, flags;
unsigned long boundary_size;
@@ -90,16 +91,17 @@ static unsigned long alloc_iommu(struct device *dev, int size)
base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
PAGE_SIZE) >> PAGE_SHIFT;
- boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
spin_lock_irqsave(&iommu_bitmap_lock, flags);
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size, align_mask);
if (offset == -1) {
need_flush = 1;
offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
- size, base_index, boundary_size, 0);
+ size, base_index, boundary_size,
+ align_mask);
}
if (offset != -1) {
next_bit = offset+size;
@@ -236,10 +238,10 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
* Caller needs to check if the iommu is needed and flush.
*/
static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
- size_t size, int dir)
+ size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size);
- unsigned long iommu_page = alloc_iommu(dev, npages);
+ unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
int i;
if (iommu_page == -1) {
@@ -262,7 +264,11 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
static dma_addr_t
gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
{
- dma_addr_t map = dma_map_area(dev, paddr, size, dir);
+ dma_addr_t map;
+ unsigned long align_mask;
+
+ align_mask = (1UL << get_order(size)) - 1;
+ map = dma_map_area(dev, paddr, size, dir, align_mask);
flush_gart();
@@ -281,7 +287,8 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
if (!need_iommu(dev, paddr, size))
return paddr;
- bus = gart_map_simple(dev, paddr, size, dir);
+ bus = dma_map_area(dev, paddr, size, dir, 0);
+ flush_gart();
return bus;
}
@@ -340,7 +347,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
unsigned long addr = sg_phys(s);
if (nonforced_iommu(dev, addr, s->length)) {
- addr = dma_map_area(dev, addr, s->length, dir);
+ addr = dma_map_area(dev, addr, s->length, dir, 0);
if (addr == bad_dma_address) {
if (i > 0)
gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +369,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int nelems, struct scatterlist *sout,
unsigned long pages)
{
- unsigned long iommu_start = alloc_iommu(dev, pages);
+ unsigned long iommu_start = alloc_iommu(dev, pages, 0);
unsigned long iommu_page = iommu_start;
struct scatterlist *s;
int i;
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea27..a311ffcaad1 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
#include <linux/platform_device.h>
-#include <linux/errno.h>
+#include <linux/err.h>
#include <linux/init.h>
static __init int add_pcspkr(void)
{
struct platform_device *pd;
- int ret;
- pd = platform_device_alloc("pcspkr", -1);
- if (!pd)
- return -ENOMEM;
+ pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
- ret = platform_device_add(pd);
- if (ret)
- platform_device_put(pd);
-
- return ret;
+ return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}
device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 876e9189077..ec7a2ba9bce 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -185,7 +185,8 @@ static void mwait_idle(void)
static void poll_idle(void)
{
local_irq_enable();
- cpu_relax();
+ while (!need_resched())
+ cpu_relax();
}
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 31f40b24bf5..205188db962 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
#include <linux/tick.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
+#include <linux/dmi.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -56,6 +57,8 @@
#include <asm/cpu.h>
#include <asm/kdebug.h>
#include <asm/idle.h>
+#include <asm/syscalls.h>
+#include <asm/smp.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -161,6 +164,7 @@ void __show_registers(struct pt_regs *regs, int all)
unsigned long d0, d1, d2, d3, d6, d7;
unsigned long sp;
unsigned short ss, gs;
+ const char *board;
if (user_mode_vm(regs)) {
sp = regs->sp;
@@ -173,11 +177,15 @@ void __show_registers(struct pt_regs *regs, int all)
}
printk("\n");
- printk("Pid: %d, comm: %s %s (%s %.*s)\n",
+
+ board = dmi_get_system_info(DMI_PRODUCT_NAME);
+ if (!board)
+ board = "";
+ printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
task_pid_nr(current), current->comm,
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ init_utsname()->version, board);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
(u16)regs->cs, regs->ip, regs->flags,
@@ -277,6 +285,14 @@ void exit_thread(void)
tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
put_cpu();
}
+#ifdef CONFIG_X86_DS
+ /* Free any DS contexts that have not been properly released. */
+ if (unlikely(current->thread.ds_ctx)) {
+ /* we clear debugctl to make sure DS is not used. */
+ update_debugctlmsr(0);
+ ds_free(current->thread.ds_ctx);
+ }
+#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -438,6 +454,35 @@ int set_tsc_mode(unsigned int val)
return 0;
}
+#ifdef CONFIG_X86_DS
+static int update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next, unsigned long debugctl)
+{
+ unsigned long ds_prev = 0;
+ unsigned long ds_next = 0;
+
+ if (prev->ds_ctx)
+ ds_prev = (unsigned long)prev->ds_ctx->ds;
+ if (next->ds_ctx)
+ ds_next = (unsigned long)next->ds_ctx->ds;
+
+ if (ds_next != ds_prev) {
+ /* we clear debugctl to make sure DS
+ * is not in use when we change it */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
+ }
+ return debugctl;
+}
+#else
+static int update_debugctl(struct thread_struct *prev,
+ struct thread_struct *next, unsigned long debugctl)
+{
+ return debugctl;
+}
+#endif /* CONFIG_X86_DS */
+
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss)
@@ -448,14 +493,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
- debugctl = prev->debugctlmsr;
- if (next->ds_area_msr != prev->ds_area_msr) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
- }
+ debugctl = update_debugctl(prev, next, prev->debugctlmsr);
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -479,13 +517,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
hard_enable_TSC();
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e12e0e4dd25..2a8ccb9238b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
-#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
-#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
@@ -51,6 +51,7 @@
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
+#include <asm/syscalls.h>
asmlinkage extern void ret_from_fork(void);
@@ -88,7 +89,7 @@ void exit_idle(void)
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
-#include <asm/nmi.h>
+#include <linux/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
@@ -153,7 +154,7 @@ void cpu_idle(void)
}
/* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs * regs)
+void __show_regs(struct pt_regs *regs)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
unsigned long d0, d1, d2, d3, d6, d7;
@@ -162,59 +163,61 @@ void __show_regs(struct pt_regs * regs)
printk("\n");
print_modules();
- printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+ printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+ printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
printk_address(regs->ip, 1);
- printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
- regs->flags);
- printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+ printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
+ regs->sp, regs->flags);
+ printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
- printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+ printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
regs->dx, regs->si, regs->di);
- printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+ printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
regs->bp, regs->r8, regs->r9);
- printk("R10: %016lx R11: %016lx R12: %016lx\n",
- regs->r10, regs->r11, regs->r12);
- printk("R13: %016lx R14: %016lx R15: %016lx\n",
- regs->r13, regs->r14, regs->r15);
-
- asm("movl %%ds,%0" : "=r" (ds));
- asm("movl %%cs,%0" : "=r" (cs));
- asm("movl %%es,%0" : "=r" (es));
+ printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+ regs->r10, regs->r11, regs->r12);
+ printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+ regs->r13, regs->r14, regs->r15);
+
+ asm("movl %%ds,%0" : "=r" (ds));
+ asm("movl %%cs,%0" : "=r" (cs));
+ asm("movl %%es,%0" : "=r" (es));
asm("movl %%fs,%0" : "=r" (fsindex));
asm("movl %%gs,%0" : "=r" (gsindex));
rdmsrl(MSR_FS_BASE, fs);
- rdmsrl(MSR_GS_BASE, gs);
- rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+ rdmsrl(MSR_GS_BASE, gs);
+ rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
cr4 = read_cr4();
- printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
- fs,fsindex,gs,gsindex,shadowgs);
- printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
- printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+ printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+ fs, fsindex, gs, gsindex, shadowgs);
+ printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+ es, cr0);
+ printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+ cr4);
get_debugreg(d0, 0);
get_debugreg(d1, 1);
get_debugreg(d2, 2);
- printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+ printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
get_debugreg(d3, 3);
get_debugreg(d6, 6);
get_debugreg(d7, 7);
- printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}
void show_regs(struct pt_regs *regs)
{
- printk("CPU %d:", smp_processor_id());
+ printk(KERN_INFO "CPU %d:", smp_processor_id());
__show_regs(regs);
show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}
@@ -240,6 +243,14 @@ void exit_thread(void)
t->io_bitmap_max = 0;
put_cpu();
}
+#ifdef CONFIG_X86_DS
+ /* Free any DS contexts that have not been properly released. */
+ if (unlikely(t->ds_ctx)) {
+ /* we clear debugctl to make sure DS is not used. */
+ update_debugctlmsr(0);
+ ds_free(t->ds_ctx);
+ }
+#endif /* CONFIG_X86_DS */
}
void flush_thread(void)
@@ -315,10 +326,10 @@ void prepare_to_copy(struct task_struct *tsk)
int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
unsigned long unused,
- struct task_struct * p, struct pt_regs * regs)
+ struct task_struct *p, struct pt_regs *regs)
{
int err;
- struct pt_regs * childregs;
+ struct pt_regs *childregs;
struct task_struct *me = current;
childregs = ((struct pt_regs *)
@@ -363,10 +374,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
if (test_thread_flag(TIF_IA32))
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
- else
-#endif
- err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
- if (err)
+ else
+#endif
+ err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+ if (err)
goto out;
}
err = 0;
@@ -473,13 +484,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
next = &next_p->thread;
debugctl = prev->debugctlmsr;
- if (next->ds_area_msr != prev->ds_area_msr) {
- /* we clear debugctl to make sure DS
- * is not in use when we change it */
- debugctl = 0;
- update_debugctlmsr(0);
- wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+
+#ifdef CONFIG_X86_DS
+ {
+ unsigned long ds_prev = 0, ds_next = 0;
+
+ if (prev->ds_ctx)
+ ds_prev = (unsigned long)prev->ds_ctx->ds;
+ if (next->ds_ctx)
+ ds_next = (unsigned long)next->ds_ctx->ds;
+
+ if (ds_next != ds_prev) {
+ /*
+ * We clear debugctl to make sure DS
+ * is not in use when we change it:
+ */
+ debugctl = 0;
+ update_debugctlmsr(0);
+ wrmsrl(MSR_IA32_DS_AREA, ds_next);
+ }
}
+#endif /* CONFIG_X86_DS */
if (next->debugctlmsr != debugctl)
update_debugctlmsr(next->debugctlmsr);
@@ -517,13 +542,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
}
/*
@@ -545,7 +570,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
unsigned fsindex, gsindex;
/* we're going to use this soon, after a few expensive things */
- if (next_p->fpu_counter>5)
+ if (next_p->fpu_counter > 5)
prefetch(next->xstate);
/*
@@ -553,13 +578,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
load_sp0(tss, next);
- /*
+ /*
* Switch DS and ES.
* This won't pick up thread selector changes, but I guess that is ok.
*/
savesegment(es, prev->es);
if (unlikely(next->es | prev->es))
- loadsegment(es, next->es);
+ loadsegment(es, next->es);
savesegment(ds, prev->ds);
if (unlikely(next->ds | prev->ds))
@@ -585,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_leave_lazy_cpu_mode();
- /*
+ /*
* Switch FS and GS.
*
* Segment register != 0 always requires a reload. Also
@@ -594,13 +619,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
if (unlikely(fsindex | next->fsindex | prev->fs)) {
loadsegment(fs, next->fsindex);
- /*
+ /*
* Check if the user used a selector != 0; if yes
* clear 64bit base, since overloaded base is always
* mapped to the Null selector
*/
if (fsindex)
- prev->fs = 0;
+ prev->fs = 0;
}
/* when next process has a 64bit base use it */
if (next->fs)
@@ -610,7 +635,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(gsindex | next->gsindex | prev->gs)) {
load_gs_index(next->gsindex);
if (gsindex)
- prev->gs = 0;
+ prev->gs = 0;
}
if (next->gs)
wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -619,12 +644,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Must be after DS reload */
unlazy_fpu(prev_p);
- /*
+ /*
* Switch the PDA and FPU contexts.
*/
prev->usersp = read_pda(oldrsp);
write_pda(oldrsp, next->usersp);
- write_pda(pcurrent, next_p);
+ write_pda(pcurrent, next_p);
write_pda(kernelstack,
(unsigned long)task_stack_page(next_p) +
@@ -665,7 +690,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
char __user * __user *envp, struct pt_regs *regs)
{
long error;
- char * filename;
+ char *filename;
filename = getname(name);
error = PTR_ERR(filename);
@@ -723,55 +748,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
unsigned long get_wchan(struct task_struct *p)
{
unsigned long stack;
- u64 fp,ip;
+ u64 fp, ip;
int count = 0;
- if (!p || p == current || p->state==TASK_RUNNING)
- return 0;
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
stack = (unsigned long)task_stack_page(p);
if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
return 0;
fp = *(u64 *)(p->thread.sp);
- do {
+ do {
if (fp < (unsigned long)stack ||
fp > (unsigned long)stack+THREAD_SIZE)
- return 0;
+ return 0;
ip = *(u64 *)(fp+8);
if (!in_sched_functions(ip))
return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
+ fp = *(u64 *)fp;
+ } while (count++ < 16);
return 0;
}
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
-{
- int ret = 0;
+{
+ int ret = 0;
int doit = task == current;
int cpu;
- switch (code) {
+ switch (code) {
case ARCH_SET_GS:
if (addr >= TASK_SIZE_OF(task))
- return -EPERM;
+ return -EPERM;
cpu = get_cpu();
- /* handle small bases via the GDT because that's faster to
+ /* handle small bases via the GDT because that's faster to
switch. */
- if (addr <= 0xffffffff) {
- set_32bit_tls(task, GS_TLS, addr);
- if (doit) {
+ if (addr <= 0xffffffff) {
+ set_32bit_tls(task, GS_TLS, addr);
+ if (doit) {
load_TLS(&task->thread, cpu);
- load_gs_index(GS_TLS_SEL);
+ load_gs_index(GS_TLS_SEL);
}
- task->thread.gsindex = GS_TLS_SEL;
+ task->thread.gsindex = GS_TLS_SEL;
task->thread.gs = 0;
- } else {
+ } else {
task->thread.gsindex = 0;
task->thread.gs = addr;
if (doit) {
load_gs_index(0);
ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
- }
+ }
}
put_cpu();
break;
@@ -825,8 +850,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
rdmsrl(MSR_KERNEL_GS_BASE, base);
else
base = task->thread.gs;
- }
- else
+ } else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e37dccce85d..e375b658efc 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/regset.h>
+#include <linux/tracehook.h>
#include <linux/user.h>
#include <linux/elf.h>
#include <linux/security.h>
@@ -69,7 +70,7 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
-static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
regno >>= 2;
@@ -554,45 +555,115 @@ static int ptrace_set_debugreg(struct task_struct *child,
return 0;
}
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * The configuration for a particular BTS hardware implementation.
+ */
+struct bts_configuration {
+ /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
+ unsigned char sizeof_bts;
+ /* the size of a field in the BTS record in bytes */
+ unsigned char sizeof_field;
+ /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
+ unsigned long debugctl_mask;
+};
+static struct bts_configuration bts_cfg;
+
+#define BTS_MAX_RECORD_SIZE (8 * 3)
+
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ */
+
+enum bts_field {
+ bts_from = 0,
+ bts_to,
+ bts_flags,
+
+ bts_escape = (unsigned long)-1,
+ bts_qual = bts_to,
+ bts_jiffies = bts_flags
+};
+
+static inline unsigned long bts_get(const char *base, enum bts_field field)
+{
+ base += (bts_cfg.sizeof_field * field);
+ return *(unsigned long *)base;
+}
-static int ptrace_bts_get_size(struct task_struct *child)
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
{
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ base += (bts_cfg.sizeof_field * field);;
+ (*(unsigned long *)base) = val;
+}
- return ds_get_bts_index((void *)child->thread.ds_area_msr);
+/*
+ * Translate a BTS record from the raw format into the bts_struct format
+ *
+ * out (out): bts_struct interpretation
+ * raw: raw BTS record
+ */
+static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
+{
+ memset(out, 0, sizeof(*out));
+ if (bts_get(raw, bts_from) == bts_escape) {
+ out->qualifier = bts_get(raw, bts_qual);
+ out->variant.jiffies = bts_get(raw, bts_jiffies);
+ } else {
+ out->qualifier = BTS_BRANCH;
+ out->variant.lbr.from_ip = bts_get(raw, bts_from);
+ out->variant.lbr.to_ip = bts_get(raw, bts_to);
+ }
}
-static int ptrace_bts_read_record(struct task_struct *child,
- long index,
+static int ptrace_bts_read_record(struct task_struct *child, size_t index,
struct bts_struct __user *out)
{
struct bts_struct ret;
- int retval;
- int bts_end;
- int bts_index;
-
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ const void *bts_record;
+ size_t bts_index, bts_end;
+ int error;
- if (index < 0)
- return -EINVAL;
+ error = ds_get_bts_end(child, &bts_end);
+ if (error < 0)
+ return error;
- bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
if (bts_end <= index)
return -EINVAL;
+ error = ds_get_bts_index(child, &bts_index);
+ if (error < 0)
+ return error;
+
/* translate the ptrace bts index into the ds bts index */
- bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
- bts_index -= (index + 1);
- if (bts_index < 0)
- bts_index += bts_end;
+ bts_index += bts_end - (index + 1);
+ if (bts_end <= bts_index)
+ bts_index -= bts_end;
- retval = ds_read_bts((void *)child->thread.ds_area_msr,
- bts_index, &ret);
- if (retval < 0)
- return retval;
+ error = ds_access_bts(child, bts_index, &bts_record);
+ if (error < 0)
+ return error;
+
+ ptrace_bts_translate_record(&ret, bts_record);
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
@@ -600,101 +671,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
return sizeof(ret);
}
-static int ptrace_bts_clear(struct task_struct *child)
-{
- if (!child->thread.ds_area_msr)
- return -ENXIO;
-
- return ds_clear((void *)child->thread.ds_area_msr);
-}
-
static int ptrace_bts_drain(struct task_struct *child,
long size,
struct bts_struct __user *out)
{
- int end, i;
- void *ds = (void *)child->thread.ds_area_msr;
-
- if (!ds)
- return -ENXIO;
+ struct bts_struct ret;
+ const unsigned char *raw;
+ size_t end, i;
+ int error;
- end = ds_get_bts_index(ds);
- if (end <= 0)
- return end;
+ error = ds_get_bts_index(child, &end);
+ if (error < 0)
+ return error;
if (size < (end * sizeof(struct bts_struct)))
return -EIO;
- for (i = 0; i < end; i++, out++) {
- struct bts_struct ret;
- int retval;
+ error = ds_access_bts(child, 0, (const void **)&raw);
+ if (error < 0)
+ return error;
- retval = ds_read_bts(ds, i, &ret);
- if (retval < 0)
- return retval;
+ for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
+ ptrace_bts_translate_record(&ret, raw);
if (copy_to_user(out, &ret, sizeof(ret)))
return -EFAULT;
}
- ds_clear(ds);
+ error = ds_clear_bts(child);
+ if (error < 0)
+ return error;
return end;
}
+static void ptrace_bts_ovfl(struct task_struct *child)
+{
+ send_sig(child->thread.bts_ovfl_signal, child, 0);
+}
+
static int ptrace_bts_config(struct task_struct *child,
long cfg_size,
const struct ptrace_bts_config __user *ucfg)
{
struct ptrace_bts_config cfg;
- int bts_size, ret = 0;
- void *ds;
+ int error = 0;
+
+ error = -EOPNOTSUPP;
+ if (!bts_cfg.sizeof_bts)
+ goto errout;
+ error = -EIO;
if (cfg_size < sizeof(cfg))
- return -EIO;
+ goto errout;
+ error = -EFAULT;
if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- return -EFAULT;
+ goto errout;
- if ((int)cfg.size < 0)
- return -EINVAL;
+ error = -EINVAL;
+ if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
+ !(cfg.flags & PTRACE_BTS_O_ALLOC))
+ goto errout;
- bts_size = 0;
- ds = (void *)child->thread.ds_area_msr;
- if (ds) {
- bts_size = ds_get_bts_size(ds);
- if (bts_size < 0)
- return bts_size;
- }
- cfg.size = PAGE_ALIGN(cfg.size);
+ if (cfg.flags & PTRACE_BTS_O_ALLOC) {
+ ds_ovfl_callback_t ovfl = NULL;
+ unsigned int sig = 0;
+
+ /* we ignore the error in case we were not tracing child */
+ (void)ds_release_bts(child);
- if (bts_size != cfg.size) {
- ret = ptrace_bts_realloc(child, cfg.size,
- cfg.flags & PTRACE_BTS_O_CUT_SIZE);
- if (ret < 0)
+ if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+ if (!cfg.signal)
+ goto errout;
+
+ sig = cfg.signal;
+ ovfl = ptrace_bts_ovfl;
+ }
+
+ error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
+ if (error < 0)
goto errout;
- ds = (void *)child->thread.ds_area_msr;
+ child->thread.bts_ovfl_signal = sig;
}
- if (cfg.flags & PTRACE_BTS_O_SIGNAL)
- ret = ds_set_overflow(ds, DS_O_SIGNAL);
- else
- ret = ds_set_overflow(ds, DS_O_WRAP);
- if (ret < 0)
+ error = -EINVAL;
+ if (!child->thread.ds_ctx && cfg.flags)
goto errout;
if (cfg.flags & PTRACE_BTS_O_TRACE)
- child->thread.debugctlmsr |= ds_debugctl_mask();
+ child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
else
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
if (cfg.flags & PTRACE_BTS_O_SCHED)
set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
else
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- ret = sizeof(cfg);
+ error = sizeof(cfg);
out:
if (child->thread.debugctlmsr)
@@ -702,10 +778,10 @@ out:
else
clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- return ret;
+ return error;
errout:
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
goto out;
}
@@ -714,29 +790,40 @@ static int ptrace_bts_status(struct task_struct *child,
long cfg_size,
struct ptrace_bts_config __user *ucfg)
{
- void *ds = (void *)child->thread.ds_area_msr;
struct ptrace_bts_config cfg;
+ size_t end;
+ const void *base, *max;
+ int error;
if (cfg_size < sizeof(cfg))
return -EIO;
- memset(&cfg, 0, sizeof(cfg));
+ error = ds_get_bts_end(child, &end);
+ if (error < 0)
+ return error;
- if (ds) {
- cfg.size = ds_get_bts_size(ds);
+ error = ds_access_bts(child, /* index = */ 0, &base);
+ if (error < 0)
+ return error;
- if (ds_get_overflow(ds) == DS_O_SIGNAL)
- cfg.flags |= PTRACE_BTS_O_SIGNAL;
+ error = ds_access_bts(child, /* index = */ end, &max);
+ if (error < 0)
+ return error;
- if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
- child->thread.debugctlmsr & ds_debugctl_mask())
- cfg.flags |= PTRACE_BTS_O_TRACE;
+ memset(&cfg, 0, sizeof(cfg));
+ cfg.size = (max - base);
+ cfg.signal = child->thread.bts_ovfl_signal;
+ cfg.bts_size = sizeof(struct bts_struct);
- if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
- cfg.flags |= PTRACE_BTS_O_SCHED;
- }
+ if (cfg.signal)
+ cfg.flags |= PTRACE_BTS_O_SIGNAL;
- cfg.bts_size = sizeof(struct bts_struct);
+ if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
+ child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+ cfg.flags |= PTRACE_BTS_O_TRACE;
+
+ if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+ cfg.flags |= PTRACE_BTS_O_SCHED;
if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
return -EFAULT;
@@ -744,89 +831,38 @@ static int ptrace_bts_status(struct task_struct *child,
return sizeof(cfg);
}
-
static int ptrace_bts_write_record(struct task_struct *child,
const struct bts_struct *in)
{
- int retval;
+ unsigned char bts_record[BTS_MAX_RECORD_SIZE];
- if (!child->thread.ds_area_msr)
- return -ENXIO;
+ BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
- retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
- if (retval)
- return retval;
+ memset(bts_record, 0, bts_cfg.sizeof_bts);
+ switch (in->qualifier) {
+ case BTS_INVALID:
+ break;
- return sizeof(*in);
-}
+ case BTS_BRANCH:
+ bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
+ bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
+ break;
-static int ptrace_bts_realloc(struct task_struct *child,
- int size, int reduce_size)
-{
- unsigned long rlim, vm;
- int ret, old_size;
+ case BTS_TASK_ARRIVES:
+ case BTS_TASK_DEPARTS:
+ bts_set(bts_record, bts_from, bts_escape);
+ bts_set(bts_record, bts_qual, in->qualifier);
+ bts_set(bts_record, bts_jiffies, in->variant.jiffies);
+ break;
- if (size < 0)
+ default:
return -EINVAL;
-
- old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
- if (old_size < 0)
- return old_size;
-
- ret = ds_free((void **)&child->thread.ds_area_msr);
- if (ret < 0)
- goto out;
-
- size >>= PAGE_SHIFT;
- old_size >>= PAGE_SHIFT;
-
- current->mm->total_vm -= old_size;
- current->mm->locked_vm -= old_size;
-
- if (size == 0)
- goto out;
-
- rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->total_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->total_vm;
- if (size <= 0)
- goto out;
- }
-
- rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
- vm = current->mm->locked_vm + size;
- if (rlim < vm) {
- ret = -ENOMEM;
-
- if (!reduce_size)
- goto out;
-
- size = rlim - current->mm->locked_vm;
- if (size <= 0)
- goto out;
}
- ret = ds_allocate((void **)&child->thread.ds_area_msr,
- size << PAGE_SHIFT);
- if (ret < 0)
- goto out;
-
- current->mm->total_vm += size;
- current->mm->locked_vm += size;
-
-out:
- if (child->thread.ds_area_msr)
- set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
- else
- clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-
- return ret;
+ /* The writing task will be the switched-to task on a context
+ * switch. It needs to write into the switched-from task's BTS
+ * buffer. */
+ return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
}
void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +875,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
ptrace_bts_write_record(tsk, &rec);
}
-#endif /* X86_BTS */
+
+static const struct bts_configuration bts_cfg_netburst = {
+ .sizeof_bts = sizeof(long) * 3,
+ .sizeof_field = sizeof(long),
+ .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
+};
+
+static const struct bts_configuration bts_cfg_pentium_m = {
+ .sizeof_bts = sizeof(long) * 3,
+ .sizeof_field = sizeof(long),
+ .debugctl_mask = (1<<6)|(1<<7)
+};
+
+static const struct bts_configuration bts_cfg_core2 = {
+ .sizeof_bts = 8 * 3,
+ .sizeof_field = 8,
+ .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+};
+
+static inline void bts_configure(const struct bts_configuration *cfg)
+{
+ bts_cfg = *cfg;
+}
+
+void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
+{
+ switch (c->x86) {
+ case 0x6:
+ switch (c->x86_model) {
+ case 0xD:
+ case 0xE: /* Pentium M */
+ bts_configure(&bts_cfg_pentium_m);
+ break;
+ case 0xF: /* Core2 */
+ case 0x1C: /* Atom */
+ bts_configure(&bts_cfg_core2);
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+ break;
+ case 0xF:
+ switch (c->x86_model) {
+ case 0x0:
+ case 0x1:
+ case 0x2: /* Netburst */
+ bts_configure(&bts_cfg_netburst);
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+ break;
+ default:
+ /* sorry, don't know about them */
+ break;
+ }
+}
+#endif /* CONFIG_X86_PTRACE_BTS */
/*
* Called by kernel/ptrace.c when detaching..
@@ -852,15 +947,15 @@ void ptrace_disable(struct task_struct *child)
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
#endif
- if (child->thread.ds_area_msr) {
-#ifdef X86_BTS
- ptrace_bts_realloc(child, 0, 0);
-#endif
- child->thread.debugctlmsr &= ~ds_debugctl_mask();
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
- }
+#ifdef CONFIG_X86_PTRACE_BTS
+ (void)ds_release_bts(child);
+
+ child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+ if (!child->thread.debugctlmsr)
+ clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+
+ clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+#endif /* CONFIG_X86_PTRACE_BTS */
}
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1075,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
/*
* These bits need more cooking - not enabled yet:
*/
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
case PTRACE_BTS_CONFIG:
ret = ptrace_bts_config
(child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_SIZE:
- ret = ptrace_bts_get_size(child);
+ ret = ds_get_bts_index(child, /* pos = */ NULL);
break;
case PTRACE_BTS_GET:
@@ -1001,14 +1096,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
case PTRACE_BTS_CLEAR:
- ret = ptrace_bts_clear(child);
+ ret = ds_clear_bts(child);
break;
case PTRACE_BTS_DRAIN:
ret = ptrace_bts_drain
(child, data, (struct bts_struct __user *) addr);
break;
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
default:
ret = ptrace_request(child, request, addr, data);
@@ -1375,30 +1470,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
force_sig_info(SIGTRAP, &info, tsk);
}
-static void syscall_trace(struct pt_regs *regs)
-{
- if (!(current->ptrace & PT_PTRACED))
- return;
-
-#if 0
- printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
- current->comm,
- regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
- current_thread_info()->flags, current->ptrace);
-#endif
-
- ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
- ? 0x80 : 0));
- /*
- * this isn't the same as continuing with a signal, but it will do
- * for normal use. strace only continues with a signal if the
- * stopping signal is not SIGTRAP. -brl
- */
- if (current->exit_code) {
- send_sig(current->exit_code, current, 1);
- current->exit_code = 0;
- }
-}
#ifdef CONFIG_X86_32
# define IS_IA32 1
@@ -1432,8 +1503,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
ret = -1L;
- if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+ tracehook_report_syscall_entry(regs))
+ ret = -1L;
if (unlikely(current->audit_context)) {
if (IS_IA32)
@@ -1459,7 +1531,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
if (test_thread_flag(TIF_SYSCALL_TRACE))
- syscall_trace(regs);
+ tracehook_report_syscall_exit(regs, 0);
/*
* If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1547,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
* system call instruction.
*/
if (test_thread_flag(TIF_SINGLESTEP) &&
- (current->ptrace & PT_PTRACED))
+ tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
send_sigtrap(current, regs, 0);
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb..f4c93f1cfc1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
static const struct desc_ptr no_idt = {};
static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+/*
+ * Keyboard reset and triple fault may result in INIT, not RESET, which
+ * doesn't work when we're in vmx root mode. Try ACPI first.
+ */
+enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9838f2539df..141efab5240 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
#define RAMDISK_LOAD_FLAG 0x4000
static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -665,6 +668,19 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = virt_to_phys(&__bss_start);
bss_resource.end = virt_to_phys(&__bss_stop)-1;
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+#endif
+
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 76e305e064f..0e67f72d931 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -162,9 +162,16 @@ void __init setup_per_cpu_areas(void)
printk(KERN_INFO
"cpu %d has no node %d or node-local memory\n",
cpu, node);
+ if (ptr)
+ printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+ cpu, __pa(ptr));
}
- else
+ else {
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+ if (ptr)
+ printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
+ cpu, node, __pa(ptr));
+ }
#endif
per_cpu_offset(cpu) = ptr - __per_cpu_start;
memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2d..8b4956e800a 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -24,4 +24,9 @@ struct rt_sigframe {
struct ucontext uc;
struct siginfo info;
};
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+ sigset_t *set, struct pt_regs *regs);
#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 6fb5bcdd893..2a2435d3037 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/tracehook.h>
#include <linux/elf.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -26,6 +27,7 @@
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/vdso.h>
+#include <asm/syscalls.h>
#include "sigframe.h"
@@ -558,8 +560,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,6 +568,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
+
return 0;
}
@@ -661,5 +664,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
+
clear_thread_flag(TIF_IRET);
}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ca316b5b742..694aa888bb1 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
#include <linux/errno.h>
#include <linux/wait.h>
#include <linux/ptrace.h>
+#include <linux/tracehook.h>
#include <linux/unistd.h>
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/compiler.h>
+#include <linux/uaccess.h>
+
#include <asm/processor.h>
#include <asm/ucontext.h>
-#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
#include <asm/mce.h>
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
#include "sigframe.h"
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
# define FIX_EFLAGS __FIX_EFLAGS
#endif
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs * regs);
-
asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
@@ -128,7 +127,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
-#define COPY(x) err |= __get_user(regs->x, &sc->x)
+#define COPY(x) (err |= __get_user(regs->x, &sc->x))
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
COPY(dx); COPY(cx); COPY(ip);
@@ -158,7 +157,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
}
{
- struct _fpstate __user * buf;
+ struct _fpstate __user *buf;
err |= __get_user(buf, &sc->fpstate);
if (buf) {
@@ -198,7 +197,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
-
+
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -208,16 +207,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
return ax;
badframe:
- signal_fault(regs,frame,"sigreturn");
+ signal_fault(regs, frame, "sigreturn");
return 0;
-}
+}
/*
* Set up a signal frame.
*/
static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
+ unsigned long mask, struct task_struct *me)
{
int err = 0;
@@ -273,35 +273,35 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
}
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs)
+ sigset_t *set, struct pt_regs *regs)
{
struct rt_sigframe __user *frame;
- struct _fpstate __user *fp = NULL;
+ struct _fpstate __user *fp = NULL;
int err = 0;
struct task_struct *me = current;
if (used_math()) {
- fp = get_stack(ka, regs, sizeof(struct _fpstate));
+ fp = get_stack(ka, regs, sizeof(struct _fpstate));
frame = (void __user *)round_down(
(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
goto give_sigsegv;
- if (save_i387(fp) < 0)
- err |= -1;
+ if (save_i387(fp) < 0)
+ err |= -1;
} else
frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
- if (ka->sa.sa_flags & SA_SIGINFO) {
+ if (ka->sa.sa_flags & SA_SIGINFO) {
err |= copy_siginfo_to_user(&frame->info, info);
if (err)
goto give_sigsegv;
}
-
+
/* Create the ucontext. */
err |= __put_user(0, &frame->uc.uc_flags);
err |= __put_user(0, &frame->uc.uc_link);
@@ -311,9 +311,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
- if (sizeof(*set) == 16) {
+ if (sizeof(*set) == 16) {
__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
- __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
+ __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
} else
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -324,7 +324,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
} else {
/* could use a vstub here */
- goto give_sigsegv;
+ goto give_sigsegv;
}
if (err)
@@ -332,7 +332,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
/* Set up registers for signal handler */
regs->di = sig;
- /* In case the signal handler was declared without prototypes */
+ /* In case the signal handler was declared without prototypes */
regs->ax = 0;
/* This also works for non SA_SIGINFO handlers because they expect the
@@ -355,37 +355,8 @@ give_sigsegv:
}
/*
- * Return -1L or the syscall number that @regs is executing.
- */
-static long current_syscall(struct pt_regs *regs)
-{
- /*
- * We always sign-extend a -1 value being set here,
- * so this is always either -1L or a syscall number.
- */
- return regs->orig_ax;
-}
-
-/*
- * Return a value that is -EFOO if the system call in @regs->orig_ax
- * returned an error. This only works for @regs from @current.
- */
-static long current_syscall_ret(struct pt_regs *regs)
-{
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
- /*
- * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
- * and will match correctly in comparisons.
- */
- return (int) regs->ax;
-#endif
- return regs->ax;
-}
-
-/*
* OK, we're invoking a handler
- */
+ */
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -394,9 +365,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
int ret;
/* Are we from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTART_RESTARTBLOCK:
case -ERESTARTNOHAND:
regs->ax = -EINTR;
@@ -429,7 +400,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
else
ret = ia32_setup_frame(sig, ka, oldset, regs);
- } else
+ } else
#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
@@ -453,15 +424,16 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* handler too.
*/
regs->flags &= ~X86_EFLAGS_TF;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+ sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked,sig);
+ sigaddset(&current->blocked, sig);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
+
+ tracehook_signal_handler(sig, info, ka, regs,
+ test_thread_flag(TIF_SINGLESTEP));
}
return ret;
@@ -518,9 +490,9 @@ static void do_signal(struct pt_regs *regs)
}
/* Did we come from a system call? */
- if (current_syscall(regs) >= 0) {
+ if (syscall_get_nr(current, regs) >= 0) {
/* Restart the system call - no handlers present */
- switch (current_syscall_ret(regs)) {
+ switch (syscall_get_error(current, regs)) {
case -ERESTARTNOHAND:
case -ERESTARTSYS:
case -ERESTARTNOINTR:
@@ -558,17 +530,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
/* deal with pending signal delivery */
if (thread_info_flags & _TIF_SIGPENDING)
do_signal(regs);
+
+ if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{
- struct task_struct *me = current;
+{
+ struct task_struct *me = current;
if (show_unhandled_signals && printk_ratelimit()) {
printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
- me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
+ me->comm, me->pid, where, frame, regs->ip,
+ regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
printk("\n");
}
- force_sig(SIGSEGV, me);
-}
+ force_sig(SIGSEGV, me);
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7985c5b3f91..45531e3ba19 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -88,7 +88,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
#else
-struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
#endif
@@ -129,7 +129,7 @@ static int boot_cpu_logical_apicid;
static cpumask_t cpu_sibling_setup_map;
/* Set if we find a B stepping CPU */
-int __cpuinitdata smp_b_stepping;
+static int __cpuinitdata smp_b_stepping;
#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
@@ -1313,16 +1313,13 @@ __init void prefill_possible_map(void)
if (!num_processors)
num_processors = 1;
-#ifdef CONFIG_HOTPLUG_CPU
if (additional_cpus == -1) {
if (disabled_cpus > 0)
additional_cpus = disabled_cpus;
else
additional_cpus = 0;
}
-#else
- additional_cpus = 0;
-#endif
+
possible = num_processors + additional_cpus;
if (possible > NR_CPUS)
possible = NR_CPUS;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 7066cb855a6..1884a8d12bf 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
#include <linux/uaccess.h>
#include <linux/unistd.h>
+#include <asm/syscalls.h>
+
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef3381..6bc211accf0 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
#include <linux/utsname.h>
#include <linux/personality.h>
#include <linux/random.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/ia32.h>
+#include <asm/syscalls.h>
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long off)
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long off)
{
long error;
- struct file * file;
+ struct file *file;
error = -EINVAL;
if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
unmapped base down for this case. This can give
conflicts with the heap, but we assume that glibc
malloc knows how to fall back to mmap. Give it 1GB
- of playground for now. -AK */
- *begin = 0x40000000;
- *end = 0x80000000;
+ of playground for now. -AK */
+ *begin = 0x40000000;
+ *end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
}
} else {
*begin = TASK_UNMAPPED_BASE;
- *end = TASK_SIZE;
+ *end = TASK_SIZE;
}
-}
+}
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
unsigned long start_addr;
unsigned long begin, end;
-
+
if (flags & MAP_FIXED)
return addr;
- find_start_end(flags, &begin, &end);
+ find_start_end(flags, &begin, &end);
if (len > end)
return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
}
if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
&& len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
+ mm->cached_hole_size = 0;
mm->free_area_cache = begin;
}
addr = mm->free_area_cache;
- if (addr < begin)
- addr = begin;
+ if (addr < begin)
+ addr = begin;
start_addr = addr;
full_search:
@@ -127,7 +129,7 @@ full_search:
return addr;
}
if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr-len);
if (!vma || addr <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr-len);
+ return mm->free_area_cache = addr-len;
}
if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
vma = find_vma(mm, addr);
if (!vma || addr+len <= vma->vm_start)
/* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
+ return mm->free_area_cache = addr;
/* remember the largest hole we saw so far */
if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
}
-asmlinkage long sys_uname(struct new_utsname __user * name)
+asmlinkage long sys_uname(struct new_utsname __user *name)
{
int err;
down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof(*name));
up_read(&uts_sem);
- if (personality(current->personality) == PER_LINUX32)
- err |= copy_to_user(&name->machine, "i686", 5);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
return err ? -EFAULT : 0;
}
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c1748..3d1be4f0fac 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
#define __NO_STUBS
#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
#include <asm/unistd_64.h>
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_64_UNISTD_H_
+#undef ASM_X86__UNISTD_64_H
typedef void (*sys_call_ptr_t)(void);
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index ffe3c664afc..bbecf8b6bf9 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
#include <asm/arch_hooks.h>
#include <asm/hpet.h>
#include <asm/time.h>
+#include <asm/timer.h>
#include "do_timer.h"
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a30..6bb7b8579e7 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/proto.h>
+#include <asm/syscalls.h>
#include "tls.h"
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 513caaca711..7a31f104bef 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -32,6 +32,8 @@
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/io.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
@@ -45,9 +47,6 @@
#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pda.h>
@@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
void printk_address(unsigned long address, int reliable)
{
- printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
+ printk(" [<%016lx>] %s%pS\n",
+ address, reliable ? "" : "? ", (void *) address);
}
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
[STACKFAULT_STACK - 1] = "#SS",
[MCE_STACK - 1] = "#MC",
#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+ [N_EXCEPTION_STACKS ...
+ N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
#endif
};
unsigned k;
@@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
}
/*
- * x86-64 can have up to three kernel stacks:
+ * x86-64 can have up to three kernel stacks:
* process stack
* interrupt stack
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
+ unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
unsigned used = 0;
struct thread_info *tinfo;
@@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!bp) {
if (task == current) {
/* Grab bp right from our regs */
- asm("movq %%rbp, %0" : "=r" (bp) :);
+ asm("movq %%rbp, %0" : "=r" (bp) : );
} else {
/* bp is the last reg pushed by switch_to */
bp = *(unsigned long *) task->thread.sp;
@@ -339,9 +340,8 @@ static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp, char *log_lvl)
{
- printk("\nCall Trace:\n");
+ printk("Call Trace:\n");
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
- printk("\n");
}
void show_trace(struct task_struct *task, struct pt_regs *regs,
@@ -357,11 +357,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack;
int i;
const int cpu = smp_processor_id();
- unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
- unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+ unsigned long *irqstack_end =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+ unsigned long *irqstack =
+ (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
- // debugging aid: "show_stack(NULL, NULL);" prints the
- // back trace for this cpu.
+ /*
+ * debugging aid: "show_stack(NULL, NULL);" prints the
+ * back trace for this cpu.
+ */
if (sp == NULL) {
if (task)
@@ -386,6 +390,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
+ printk("\n");
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
@@ -404,7 +409,7 @@ void dump_stack(void)
#ifdef CONFIG_FRAME_POINTER
if (!bp)
- asm("movq %%rbp, %0" : "=r" (bp):);
+ asm("movq %%rbp, %0" : "=r" (bp) : );
#endif
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -414,7 +419,6 @@ void dump_stack(void)
init_utsname()->version);
show_trace(NULL, NULL, &stack, bp);
}
-
EXPORT_SYMBOL(dump_stack);
void show_registers(struct pt_regs *regs)
@@ -443,7 +447,6 @@ void show_registers(struct pt_regs *regs)
printk("Stack: ");
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
regs->bp, "");
- printk("\n");
printk(KERN_EMERG "Code: ");
@@ -493,7 +496,7 @@ unsigned __kprobes long oops_begin(void)
raw_local_irq_save(flags);
cpu = smp_processor_id();
if (!__raw_spin_trylock(&die_lock)) {
- if (cpu == die_owner)
+ if (cpu == die_owner)
/* nested oops. should stop eventually */;
else
__raw_spin_lock(&die_lock);
@@ -638,7 +641,7 @@ kernel_trap:
}
#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
@@ -648,7 +651,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
}
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
+asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
siginfo_t info; \
info.si_signo = signr; \
@@ -683,7 +686,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
}
-asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
+asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
@@ -778,9 +781,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
}
static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
- if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
+ NOTIFY_STOP)
return;
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
@@ -882,7 +886,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
else if (user_mode(eregs))
regs = task_pt_regs(current);
/* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ kernel process stack. */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -891,7 +895,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
}
/* runs on IST stack. */
-asmlinkage void __kprobes do_debug(struct pt_regs * regs,
+asmlinkage void __kprobes do_debug(struct pt_regs *regs,
unsigned long error_code)
{
struct task_struct *tsk = current;
@@ -1035,7 +1039,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
asmlinkage void bad_intr(void)
{
- printk("bad interrupt");
+ printk("bad interrupt");
}
asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1047,7 +1051,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
conditional_sti(regs);
if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel simd math error", 19))
+ kernel_math_error(regs, "kernel simd math error", 19))
return;
/*
@@ -1092,7 +1096,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
force_sig_info(SIGFPE, &info, task);
}
-asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
+asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
{
}
@@ -1149,8 +1153,10 @@ void __init trap_init(void)
set_intr_gate(0, &divide_error);
set_intr_gate_ist(1, &debug, DEBUG_STACK);
set_intr_gate_ist(2, &nmi, NMI_STACK);
- set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
- set_system_gate(4, &overflow); /* int4 can be called from all */
+ /* int3 can be called from all */
+ set_system_gate_ist(3, &int3, DEBUG_STACK);
+ /* int4 can be called from all */
+ set_system_gate(4, &overflow);
set_intr_gate(5, &bounds);
set_intr_gate(6, &invalid_op);
set_intr_gate(7, &device_not_available);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f98e9de1b8..161bb850fc4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
/*
* Read TSC and the reference counters. Take care of SMI disturbance
*/
-static u64 tsc_read_refs(u64 *pm, u64 *hpet)
+static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
for (i = 0; i < MAX_RETRIES; i++) {
t1 = get_cycles();
if (hpet)
- *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+ *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
else
- *pm = acpi_pm_read_early();
+ *p = acpi_pm_read_early();
t2 = get_cycles();
if ((t2 - t1) < SMI_TRESHOLD)
return t2;
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
}
/*
+ * Calculate the TSC frequency from HPET reference
+ */
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
+{
+ u64 tmp;
+
+ if (hpet2 < hpet1)
+ hpet2 += 0x100000000ULL;
+ hpet2 -= hpet1;
+ tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+ do_div(tmp, 1000000);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+ u64 tmp;
+
+ if (!pm1 && !pm2)
+ return ULONG_MAX;
+
+ if (pm2 < pm1)
+ pm2 += (u64)ACPI_PM_OVRRUN;
+ pm2 -= pm1;
+ tmp = pm2 * 1000000000LL;
+ do_div(tmp, PMTMR_TICKS_PER_SEC);
+ do_div(deltatsc, tmp);
+
+ return (unsigned long) deltatsc;
+}
+
+#define CAL_MS 10
+#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS 1000
+
+#define CAL2_MS 50
+#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS 5000
+
+
+/*
* Try to calibrate the TSC against the Programmable
* Interrupt Timer and return the frequency of the TSC
* in kHz.
*
* Return ULONG_MAX on failure to calibrate.
*/
-static unsigned long pit_calibrate_tsc(void)
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
{
u64 tsc, t1, t2, delta;
unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
* (LSB then MSB) to begin countdown.
*/
outb(0xb0, 0x43);
- outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
- outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
+ outb(latch & 0xff, 0x42);
+ outb(latch >> 8, 0x42);
tsc = t1 = t2 = get_cycles();
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
/*
* Sanity checks:
*
- * If we were not able to read the PIT more than 5000
+ * If we were not able to read the PIT more than loopmin
* times, then we have been hit by a massive SMI
*
* If the maximum is 10 times larger than the minimum,
* then we got hit by an SMI as well.
*/
- if (pitcnt < 5000 || tscmax > 10 * tscmin)
+ if (pitcnt < loopmin || tscmax > 10 * tscmin)
return ULONG_MAX;
/* Calculate the PIT value */
delta = t2 - t1;
- do_div(delta, 50);
+ do_div(delta, ms);
return delta;
}
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ * - the PIT is running at roughly 1.19MHz
+ *
+ * - each IO is going to take about 1us on real hardware,
+ * but we allow it to be much faster (by a factor of 10) or
+ * _slightly_ slower (ie we allow up to a 2us read+counter
+ * update - anything else implies a unacceptably slow CPU
+ * or PIT for the fast calibration to work.
+ *
+ * - with 256 PIT ticks to read the value, we have 214us to
+ * see the same MSB (and overhead like doing a single TSC
+ * read per MSB value etc).
+ *
+ * - We're doing 2 reads per loop (LSB, MSB), and we expect
+ * them each to take about a microsecond on real hardware.
+ * So we expect a count value of around 100. But we'll be
+ * generous, and accept anything over 50.
+ *
+ * - if the PIT is stuck, and we see *many* more reads, we
+ * return early (and the next caller of pit_expect_msb()
+ * then consider it a failure when they don't see the
+ * next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+ int count = 0;
+
+ for (count = 0; count < 50000; count++) {
+ /* Ignore LSB */
+ inb(0x42);
+ if (inb(0x42) != val)
+ break;
+ }
+ return count > 50;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for a
+ * 15ms calibration, which assuming a 2us counter read
+ * error should give us roughly 150 ppm precision for
+ * the calibration.
+ */
+#define QUICK_PIT_MS 15
+#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+ /* Set the Gate high, disable speaker */
+ outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+ /*
+ * Counter 2, mode 0 (one-shot), binary count
+ *
+ * NOTE! Mode 2 decrements by two (and then the
+ * output is flipped each time, giving the same
+ * final output frequency as a decrement-by-one),
+ * so mode 0 is much better when looking at the
+ * individual counts.
+ */
+ outb(0xb0, 0x43);
+
+ /* Start at 0xffff */
+ outb(0xff, 0x42);
+ outb(0xff, 0x42);
+
+ if (pit_expect_msb(0xff)) {
+ int i;
+ u64 t1, t2, delta;
+ unsigned char expect = 0xfe;
+
+ t1 = get_cycles();
+ for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+ if (!pit_expect_msb(expect))
+ goto failed;
+ }
+ t2 = get_cycles();
+
+ /*
+ * Make sure we can rely on the second TSC timestamp:
+ */
+ if (!pit_expect_msb(expect))
+ goto failed;
+
+ /*
+ * Ok, if we get here, then we've seen the
+ * MSB of the PIT decrement QUICK_PIT_ITERATIONS
+ * times, and each MSB had many hits, so we never
+ * had any sudden jumps.
+ *
+ * As a result, we can depend on there not being
+ * any odd delays anywhere, and the TSC reads are
+ * reliable.
+ *
+ * kHz = ticks / time-in-seconds / 1000;
+ * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
+ * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
+ */
+ delta = (t2 - t1)*PIT_TICK_RATE;
+ do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
+ printk("Fast TSC calibration using PIT\n");
+ return delta;
+ }
+failed:
+ return 0;
+}
/**
* native_calibrate_tsc - calibrate the tsc on boot
*/
unsigned long native_calibrate_tsc(void)
{
- u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2;
+ u64 tsc1, tsc2, delta, ref1, ref2;
unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
- unsigned long flags;
- int hpet = is_hpet_enabled(), i;
+ unsigned long flags, latch, ms, fast_calibrate;
+ int hpet = is_hpet_enabled(), i, loopmin;
+
+ local_irq_save(flags);
+ fast_calibrate = quick_pit_calibrate();
+ local_irq_restore(flags);
+ if (fast_calibrate)
+ return fast_calibrate;
/*
* Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
*/
- for (i = 0; i < 5; i++) {
+
+ /* Preset PIT loop values */
+ latch = CAL_LATCH;
+ ms = CAL_MS;
+ loopmin = CAL_PIT_LOOPS;
+
+ for (i = 0; i < 3; i++) {
unsigned long tsc_pit_khz;
/*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
* read the end value.
*/
local_irq_save(flags);
- tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
- tsc_pit_khz = pit_calibrate_tsc();
- tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+ tsc1 = tsc_read_refs(&ref1, hpet);
+ tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
+ tsc2 = tsc_read_refs(&ref2, hpet);
local_irq_restore(flags);
/* Pick the lowest PIT TSC calibration so far */
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !pm1 && !pm2)
+ if (!hpet && !ref1 && !ref2)
continue;
/* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
continue;
tsc2 = (tsc2 - tsc1) * 1000000LL;
+ if (hpet)
+ tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+ else
+ tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
- if (hpet) {
- if (hpet2 < hpet1)
- hpet2 += 0x100000000ULL;
- hpet2 -= hpet1;
- tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
- do_div(tsc1, 1000000);
- } else {
- if (pm2 < pm1)
- pm2 += (u64)ACPI_PM_OVRRUN;
- pm2 -= pm1;
- tsc1 = pm2 * 1000000000LL;
- do_div(tsc1, PMTMR_TICKS_PER_SEC);
+ tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+ /* Check the reference deviation */
+ delta = ((u64) tsc_pit_min) * 100;
+ do_div(delta, tsc_ref_min);
+
+ /*
+ * If both calibration results are inside a 10% window
+ * then we can be sure, that the calibration
+ * succeeded. We break out of the loop right away. We
+ * use the reference value, as it is more precise.
+ */
+ if (delta >= 90 && delta <= 110) {
+ printk(KERN_INFO
+ "TSC: PIT calibration matches %s. %d loops\n",
+ hpet ? "HPET" : "PMTIMER", i + 1);
+ return tsc_ref_min;
}
- do_div(tsc2, tsc1);
- tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+ /*
+ * Check whether PIT failed more than once. This
+ * happens in virtualized environments. We need to
+ * give the virtual PC a slightly longer timeframe for
+ * the HPET/PMTIMER to make the result precise.
+ */
+ if (i == 1 && tsc_pit_min == ULONG_MAX) {
+ latch = CAL2_LATCH;
+ ms = CAL2_MS;
+ loopmin = CAL2_PIT_LOOPS;
+ }
}
/*
@@ -270,7 +463,7 @@ unsigned long native_calibrate_tsc(void)
printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
/* We don't have an alternative source, disable TSC */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk("TSC: No reference (HPET/PMTIMER) available\n");
return 0;
}
@@ -278,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
/* The alternative source failed as well, disable TSC */
if (tsc_ref_min == ULONG_MAX) {
printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
- "failed due to SMI disturbance.\n");
+ "failed.\n");
return 0;
}
@@ -290,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
}
/* We don't have an alternative source, use the PIT calibration value */
- if (!hpet && !pm1 && !pm2) {
+ if (!hpet && !ref1 && !ref2) {
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
/* The alternative source failed, use the PIT calibration value */
if (tsc_ref_min == ULONG_MAX) {
- printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due "
- "to SMI disturbance. Using PIT calibration\n");
+ printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
+ "Using PIT calibration\n");
return tsc_pit_min;
}
- /* Check the reference deviation */
- delta = ((u64) tsc_pit_min) * 100;
- do_div(delta, tsc_ref_min);
-
- /*
- * If both calibration results are inside a 5% window, the we
- * use the lower frequency of those as it is probably the
- * closest estimate.
- */
- if (delta >= 95 && delta <= 105) {
- printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
- hpet ? "HPET" : "PMTIMER");
- printk(KERN_INFO "TSC: using %s calibration value\n",
- tsc_pit_min <= tsc_ref_min ? "PIT" :
- hpet ? "HPET" : "PMTIMER");
- return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
- }
-
- printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
- hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
-
/*
* The calibration values differ too much. In doubt, we use
* the PIT value as we know that there are PMTIMERs around
- * running at double speed.
+ * running at double speed. At least we let the user know:
*/
+ printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
+ hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
printk(KERN_INFO "TSC: Using PIT calibration value\n");
return tsc_pit_min;
}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 594ef47f0a6..61a97e616f7 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
#include <asm/visws/cobalt.h>
#include <asm/visws/piix4.h>
#include <asm/arch_hooks.h>
+#include <asm/io_apic.h>
#include <asm/fixmap.h>
#include <asm/reboot.h>
#include <asm/setup.h>
#include <asm/e820.h>
-#include <asm/smp.h>
#include <asm/io.h>
#include <mach_ipi.h>
#include "mach_apic.h"
-#include <linux/init.h>
-#include <linux/smp.h>
-
#include <linux/kernel_stat.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/apic.h>
#include <asm/i8259.h>
#include <asm/irq_vectors.h>
-#include <asm/visws/cobalt.h>
#include <asm/visws/lithium.h>
-#include <asm/visws/piix4.h>
#include <linux/sched.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/pci.h>
#include <linux/pci_ids.h>
extern int no_broadcast;
-#include <asm/io.h>
#include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/visws/cobalt.h>
-#include <asm/visws/lithium.h>
char visws_board_type = -1;
char visws_board_rev = -1;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d..4eeb5cf9720 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
+#include <asm/syscalls.h>
/*
* Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index edfb09f3047..8c9ad02af5a 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pte(u32 pfn)
+static void vmi_release_pte(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pmd(u32 pfn)
+static void vmi_release_pmd(unsigned long pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);