From 849620fab413355eff48232eac5a8c53c57615c5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 14 May 2009 17:10:52 +0200 Subject: Revert "oprofile: discover counters for op ppro too" This reverts commit 59512900baab03c5629f2ff5efad1d5d4e682ece. arch_perfmon_setup_counters() is actually never called for ppro, so there is no code that changes the numbers in op_ppro_spec. The patch as it is has no effect. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 8 +++----- arch/x86/oprofile/op_x86_model.h | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 10131fbdaad..2a123990a84 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -213,9 +213,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } -struct op_x86_model_spec op_ppro_spec = { - .num_counters = 2, /* can be overriden */ - .num_controls = 2, /* dito */ +struct op_x86_model_spec const op_ppro_spec = { + .num_counters = 2, + .num_controls = 2, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -251,8 +251,6 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; - op_ppro_spec.num_counters = num_counters; - op_ppro_spec.num_controls = num_counters; } struct op_x86_model_spec op_arch_perfmon_spec = { diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 825e79064d6..2317149c94f 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -45,7 +45,7 @@ struct op_x86_model_spec { void (*shutdown)(struct op_msrs const * const msrs); }; -extern struct op_x86_model_spec op_ppro_spec; +extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct 
op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; -- cgit v1.2.3-70-g09d2 From e419294ed3c98cccc145202e4fe165bfd8099d63 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sun, 12 Oct 2008 15:12:34 -0400 Subject: x86/oprofile: moving arch_perfmon counter setup to op_x86_model_spec.init The function arch_perfmon_init() in nmi_int.c is model specific. This patch moves it to op_model_ppro.c by using the init function pointer in struct op_x86_model_spec. Cc: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 21 +++++++++------------ arch/x86/oprofile/op_model_ppro.c | 9 ++++++++- arch/x86/oprofile/op_x86_model.h | 2 -- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3b285e656e2..dd8515301fb 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -427,7 +427,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 26: - arch_perfmon_setup_counters(); + model = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; case 28: @@ -442,16 +442,6 @@ static int __init ppro_init(char **cpu_type) return 1; } -static int __init arch_perfmon_init(char **cpu_type) -{ - if (!cpu_has_arch_perfmon) - return 0; - *cpu_type = "i386/arch_perfmon"; - model = &op_arch_perfmon_spec; - arch_perfmon_setup_counters(); - return 1; -} - /* in order to get sysfs right */ static int using_nmi; @@ -509,8 +499,15 @@ int __init op_nmi_init(struct oprofile_operations *ops) break; } - if (!cpu_type && !arch_perfmon_init(&cpu_type)) + if (cpu_type) + break; + + if (!cpu_has_arch_perfmon) return -ENODEV; + + /* use arch perfmon as fallback */ + cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; break; default: diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 2a123990a84..ae581196688 100644 --- a/arch/x86/oprofile/op_model_ppro.c 
+++ b/arch/x86/oprofile/op_model_ppro.c @@ -233,7 +233,7 @@ struct op_x86_model_spec const op_ppro_spec = { * the specific CPU. */ -void arch_perfmon_setup_counters(void) +static void arch_perfmon_setup_counters(void) { union cpuid10_eax eax; @@ -253,7 +253,14 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_controls = num_counters; } +static int arch_perfmon_init(struct oprofile_operations *ignore) +{ + arch_perfmon_setup_counters(); + return 0; +} + struct op_x86_model_spec op_arch_perfmon_spec = { + .init = &arch_perfmon_init, /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, /* user space does the cpuid check for available events */ diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 2317149c94f..ed27783bb0d 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -51,6 +51,4 @@ extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; -extern void arch_perfmon_setup_counters(void); - #endif /* OP_X86_MODEL_H */ -- cgit v1.2.3-70-g09d2 From 06552ccc36abeb12e37efc16c384dc7f30794f85 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 28 May 2009 02:12:36 +0200 Subject: x86/oprofile: minor style changes in struct op_x86_model_spec Some vertical alignments. Variables are now located in the beginning of the struct. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_x86_model.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index ed27783bb0d..bd8157d12ff 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -32,17 +32,17 @@ struct pt_regs; * various x86 CPU models' perfctr support. 
*/ struct op_x86_model_spec { - int (*init)(struct oprofile_operations *ops); - void (*exit)(void); - unsigned int num_counters; - unsigned int num_controls; - void (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_msrs const * const msrs); - int (*check_ctrs)(struct pt_regs * const regs, - struct op_msrs const * const msrs); - void (*start)(struct op_msrs const * const msrs); - void (*stop)(struct op_msrs const * const msrs); - void (*shutdown)(struct op_msrs const * const msrs); + unsigned int num_counters; + unsigned int num_controls; + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); + void (*fill_in_addresses)(struct op_msrs * const msrs); + void (*setup_ctrs)(struct op_msrs const * const msrs); + int (*check_ctrs)(struct pt_regs * const regs, + struct op_msrs const * const msrs); + void (*start)(struct op_msrs const * const msrs); + void (*stop)(struct op_msrs const * const msrs); + void (*shutdown)(struct op_msrs const * const msrs); }; extern struct op_x86_model_spec const op_ppro_spec; -- cgit v1.2.3-70-g09d2 From 9063759540daac40cc1f402f83a3be6b489f8583 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 10 Mar 2009 19:15:57 +0100 Subject: x86/oprofile: remove #ifdefs in ibs functions IBS code is moved to separate functions. This allows the removal of #ifdefs in functions. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 80 +++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 8fdf06e4edf..b54c0880b7d 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -220,6 +220,50 @@ op_amd_handle_ibs(struct pt_regs * const regs, return 1; } +static inline void op_amd_start_ibs(void) +{ + unsigned int low, high; + if (has_ibs && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + + IBS_FETCH_HIGH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (has_ibs && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + + IBS_OP_LOW_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +} + +static void op_amd_stop_ibs(void) +{ + unsigned int low, high; + if (has_ibs && ibs_config.fetch_enabled) { + /* clear max count and enable */ + low = 0; + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (has_ibs && ibs_config.op_enabled) { + /* clear max count and enable */ + low = 0; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +} + +#else + +static inline int op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) { } +static inline void op_amd_start_ibs(void) { } +static inline void op_amd_stop_ibs(void) { } + #endif static int op_amd_check_ctrs(struct pt_regs * const regs, @@ -238,9 +282,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } -#ifdef CONFIG_OPROFILE_IBS op_amd_handle_ibs(regs, msrs); -#endif /* See op_model_ppro.c */ return 1; @@ -258,25 +300,9 @@ static void op_amd_start(struct op_msrs const * const msrs) } } -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - low = 
(ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ - + IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (has_ibs && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) - + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ - + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_start_ibs(); } - static void op_amd_stop(struct op_msrs const * const msrs) { unsigned int low, high; @@ -294,21 +320,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } -#ifdef CONFIG_OPROFILE_IBS - if (has_ibs && ibs_config.fetch_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (has_ibs && ibs_config.op_enabled) { - /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif + op_amd_stop_ibs(); } static void op_amd_shutdown(struct op_msrs const * const msrs) -- cgit v1.2.3-70-g09d2 From d20f24c66011f8a397bca6c5d1a6a7c7e612d2d7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sun, 11 Jan 2009 13:01:16 +0100 Subject: x86/oprofile: simplify AMD cpu init code Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index dd8515301fb..ae0ab03959b 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -460,27 +460,26 @@ int __init op_nmi_init(struct oprofile_operations *ops) /* Needs to be at least an Athlon (or hammer in 32bit mode) */ switch (family) { - default: - return -ENODEV; case 6: - model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_amd_spec; - /* Actually it could be i386/hammer too, but give - user space an consistent name. 
*/ + /* + * Actually it could be i386/hammer too, but + * give user space an consistent name. + */ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_amd_spec; cpu_type = "x86-64/family10"; break; case 0x11: - model = &op_amd_spec; cpu_type = "x86-64/family11h"; break; + default: + return -ENODEV; } + model = &op_amd_spec; break; case X86_VENDOR_INTEL: -- cgit v1.2.3-70-g09d2 From ff9faa8b676e195476b86f03fe58db0f01bda8f3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 May 2009 15:36:29 +0200 Subject: x86/oprofile: move common macros to op_x86_model.h There are duplicate macro implementations in model specific code. This patch moves all common macros to op_x86_model.h. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 8 -------- arch/x86/oprofile/op_model_p4.c | 2 -- arch/x86/oprofile/op_model_ppro.c | 8 -------- arch/x86/oprofile/op_x86_model.h | 9 +++++++++ 4 files changed, 9 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b54c0880b7d..4b9254a67e6 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -26,22 +26,14 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) #define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 
1 : 0) #define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) #define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) #define CTRL_CLEAR_LO(x) (x &= (1<<21)) #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) #define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 819b131fd75..420c15e7123 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -366,8 +366,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) #define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index ae581196688..a922a1a815c 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -26,19 +26,11 @@ static int num_counters = 2; static int counter_width = 32; -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 
1 : 0) #define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) #define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) #define CTRL_CLEAR(x) (x &= (1<<21)) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) static u64 *reset_value; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index bd8157d12ff..c80ec7d0999 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -11,6 +11,15 @@ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) + struct op_saved_msr { unsigned int high; unsigned int low; -- cgit v1.2.3-70-g09d2 From d2731a4387ad6c6bca07abfe9ed41d450fb6d665 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 May 2009 19:47:38 +0200 Subject: x86/oprofile: remove MSR macros for AMD cpus The macros CTRL_READ() and CTRL_WRITE() make the code hard to read and maintain. This patch replaces them by rdmsr()/wrmsr() functions and simplifies the code. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 4b9254a67e6..c6181c265ae 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -26,12 +26,7 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) #define CTRL_CLEAR_LO(x) (x &= (1<<21)) #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) @@ -101,17 +96,17 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) for (i = 0 ; i < NUM_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; - CTR_WRITE(1, msrs, i); + wrmsr(msrs->counters[i].addr, -1, -1); } /* enable active counters */ @@ -119,9 +114,9 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - CTR_WRITE(counter_config[i].count, msrs, i); + wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); - CTRL_READ(low, high, msrs, i); + 
rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); CTRL_SET_ENABLE(low); @@ -133,7 +128,7 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) CTRL_SET_HOST_ONLY(high, 0); CTRL_SET_GUEST_ONLY(high, 0); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } else { reset_value[i] = 0; } @@ -267,10 +262,10 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; - CTR_READ(low, high, msrs, i); + rdmsr(msrs->counters[i].addr, low, high); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); + wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); } } @@ -286,9 +281,9 @@ static void op_amd_start(struct op_msrs const * const msrs) int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } } @@ -307,9 +302,9 @@ static void op_amd_stop(struct op_msrs const * const msrs) for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } op_amd_stop_ibs(); -- cgit v1.2.3-70-g09d2 From 74c9a5c341bb1f6cbb5095b07c77230f19682ce8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 May 2009 19:47:38 +0200 Subject: x86/oprofile: remove MSR macros for ppro cpus The macros CTRL_READ() and CTRL_WRITE() make the code hard to read and maintain. This patch replaces them by rdmsr()/wrmsr() functions and simplifies the code. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index a922a1a815c..6c5d288c566 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -27,9 +27,6 @@ static int num_counters = 2; static int counter_width = 32; #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) - -#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) #define CTRL_CLEAR(x) (x &= (1<<21)) #define CTRL_SET_EVENT(val, e) (val |= e) @@ -88,9 +85,9 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) for (i = 0 ; i < num_counters; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } /* avoid a false detection of ctr overflows in NMI handler */ @@ -107,14 +104,14 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) wrmsrl(msrs->counters[i].addr, -reset_value[i]); - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_CLEAR(low); CTRL_SET_ENABLE(low); CTRL_SET_USR(low, counter_config[i].user); CTRL_SET_KERN(low, counter_config[i].kernel); CTRL_SET_UM(low, counter_config[i].unit_mask); CTRL_SET_EVENT(low, counter_config[i].event); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } else { reset_value[i] = 0; } @@ -162,9 +159,9 @@ static void ppro_start(struct op_msrs const * const msrs) return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, 
msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } } } @@ -180,9 +177,9 @@ static void ppro_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CTRL_READ(low, high, msrs, i); + rdmsr(msrs->controls[i].addr, low, high); CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); + wrmsr(msrs->controls[i].addr, low, high); } } -- cgit v1.2.3-70-g09d2 From 1131a478245b00664ae2dbc0f68db987b51fa806 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 20:23:23 +0200 Subject: x86/oprofile: remove MSR macros for p4 cpus The macros CTRL_READ() and CTRL_WRITE() make the code hard to read and maintain. This patch replaces them by rdmsr()/wrmsr() functions and simplifies the code. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_p4.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 420c15e7123..365d8a9c03d 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -350,8 +350,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) -#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) -#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) #define CCCR_RESERVED_BITS 0x38030FFF #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) @@ -361,13 +359,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_READ(low, 
high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) -#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) -#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) @@ -513,7 +507,7 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) if (ev->bindings[i].virt_counter & counter_bit) { /* modify ESCR */ - ESCR_READ(escr, high, ev, i); + rdmsr(ev->bindings[i].escr_address, escr, high); ESCR_CLEAR(escr); if (stag == 0) { ESCR_SET_USR_0(escr, counter_config[ctr].user); @@ -524,10 +518,11 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } ESCR_SET_EVENT_SELECT(escr, ev->event_select); ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); - ESCR_WRITE(escr, high, ev, i); + wrmsr(ev->bindings[i].escr_address, escr, high); /* modify CCCR */ - CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); + rdmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); @@ -535,7 +530,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) CCCR_SET_PMI_OVF_0(cccr); else CCCR_SET_PMI_OVF_1(cccr); - CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); + wrmsr(p4_counters[VIRT_CTR(stag, ctr)].cccr_address, + cccr, high); return; } } @@ -582,7 +578,8 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); - CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(u32)counter_config[i].count, -1); } else { reset_value[i] = 0; } @@ -622,14 
+619,16 @@ static int p4_check_ctrs(struct pt_regs * const regs, real = VIRT_CTR(stag, i); - CCCR_READ(low, high, real); - CTR_READ(ctr, high, real); + rdmsr(p4_counters[real].cccr_address, low, high); + rdmsr(p4_counters[real].counter_address, ctr, high); if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], real); + wrmsr(p4_counters[real].counter_address, + -(u32)reset_value[i], -1); CCCR_CLEAR_OVF(low); - CCCR_WRITE(low, high, real); - CTR_WRITE(reset_value[i], real); + wrmsr(p4_counters[real].cccr_address, low, high); + wrmsr(p4_counters[real].counter_address, + -(u32)reset_value[i], -1); } } @@ -651,9 +650,9 @@ static void p4_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_ENABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } @@ -668,9 +667,9 @@ static void p4_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - CCCR_READ(low, high, VIRT_CTR(stag, i)); + rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_SET_DISABLE(low); - CCCR_WRITE(low, high, VIRT_CTR(stag, i)); + wrmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); } } -- cgit v1.2.3-70-g09d2 From ec064c093e254f4433afb17dcef7f964c76436af Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 15:05:50 +0200 Subject: x86/oprofile: fix and cleanup CTRL_SET_* macros This patch fixes missing braces around macro parameters. Macro definitions from intel_arch_perfmon.h are used where possible. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 1 - arch/x86/oprofile/op_x86_model.h | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 6c5d288c566..61ee8f64053 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index c80ec7d0999..a207b1c46e2 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -11,14 +11,16 @@ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) +#include + +#define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0) +#define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0) +#define CTRL_SET_ACTIVE(val) ((val) |= ARCH_PERFMON_EVENTSEL0_ENABLE) +#define CTRL_SET_ENABLE(val) ((val) |= ARCH_PERFMON_EVENTSEL_INT) +#define CTRL_SET_INACTIVE(val) ((val) &= ~ARCH_PERFMON_EVENTSEL0_ENABLE) +#define CTRL_SET_KERN(val, k) ((val) |= ((k) ? ARCH_PERFMON_EVENTSEL_OS : 0)) +#define CTRL_SET_USR(val, u) ((val) |= ((u) ? 
ARCH_PERFMON_EVENTSEL_USR : 0)) +#define CTRL_SET_UM(val, m) ((val) |= ((m) << 8)) struct op_saved_msr { unsigned int high; -- cgit v1.2.3-70-g09d2 From 9c59354b48ce9cf28048b02fea73dd0236f876ea Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 18:16:43 +0200 Subject: x86/oprofile: remove unused macros for AMD virtualization profiling The use of the macros has no effect. The oprofilefs has to be extended first to support these features. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index c6181c265ae..aaa7ffaed6b 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -31,8 +31,6 @@ #define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) #define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) #define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; @@ -125,9 +123,6 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) CTRL_SET_UM(low, counter_config[i].unit_mask); CTRL_SET_EVENT_LOW(low, counter_config[i].event); CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - wrmsr(msrs->controls[i].addr, low, high); } else { reset_value[i] = 0; -- cgit v1.2.3-70-g09d2 From ef8828ddf828174785421af67c281144d4b8e796 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 19:31:44 +0200 Subject: x86/oprofile: pass the model to setup_ctrs() functions In follow-on patches the setup_ctrs() functions will need data that describes the model. This patch extends the function argument list to pass a pointer to the model to these functions. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 2 +- arch/x86/oprofile/op_model_amd.c | 3 ++- arch/x86/oprofile/op_model_p4.c | 3 ++- arch/x86/oprofile/op_model_ppro.c | 3 ++- arch/x86/oprofile/op_x86_model.h | 3 ++- 5 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index ae0ab03959b..c31f87bbf43 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -125,7 +125,7 @@ static void nmi_cpu_setup(void *dummy) int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); spin_lock(&oprofilefs_lock); - model->setup_ctrs(msrs); + model->setup_ctrs(model, msrs); spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index aaa7ffaed6b..86e0a01ba12 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -85,7 +85,8 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) } -static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int low, high; int i; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 365d8a9c03d..05ba0287b1f 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -542,7 +542,8 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) } -static void p4_setup_ctrs(struct op_msrs const * const msrs) +static void p4_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int i; unsigned int low, high; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 61ee8f64053..40b44ee521d 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ 
-51,7 +51,8 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) } -static void ppro_setup_ctrs(struct op_msrs const * const msrs) +static void ppro_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) { unsigned int low, high; int i; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index a207b1c46e2..6161c7f0e7f 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -48,7 +48,8 @@ struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); void (*fill_in_addresses)(struct op_msrs * const msrs); - void (*setup_ctrs)(struct op_msrs const * const msrs); + void (*setup_ctrs)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); int (*check_ctrs)(struct pt_regs * const regs, struct op_msrs const * const msrs); void (*start)(struct op_msrs const * const msrs); -- cgit v1.2.3-70-g09d2 From 3370d358569755625aba4d9a846a040ce691d9ed Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 15:10:32 +0200 Subject: x86/oprofile: replace macros to calculate control register This patch introduces op_x86_get_ctrl() to calculate the value of the performance control register. This is generic code usable for all models. The event and reserved masks are model specific and stored in struct op_x86_model_spec. 64 bit MSR functions are used now. The patch removes many hard to read macros used for ctrl calculation. The function op_x86_get_ctrl() is common code and the first step to further merge performance counter implementations for x86 models. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 20 +++++++++++++++++++ arch/x86/oprofile/op_model_amd.c | 41 +++++++++++++++------------------------ arch/x86/oprofile/op_model_ppro.c | 29 +++++++++++++-------------- arch/x86/oprofile/op_x86_model.h | 15 ++++++++++---- 4 files changed, 60 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index c31f87bbf43..388ee15e0e4 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -31,6 +31,26 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; +/* common functions */ + +u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config) +{ + u64 val = 0; + u16 event = (u16)counter_config->event; + + val |= ARCH_PERFMON_EVENTSEL_INT; + val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; + val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; + val |= (counter_config->unit_mask & 0xFF) << 8; + event &= model->event_mask ? 
model->event_mask : 0xFF; + val |= event & 0xFF; + val |= (event & 0x0F00) << 24; + + return val; +} + + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 86e0a01ba12..2406ab86360 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -25,12 +25,11 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#define OP_EVENT_MASK 0x0FFF + +#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) #define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) static unsigned long reset_value[NUM_COUNTERS]; @@ -84,21 +83,19 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) } } - static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ @@ -112,19 +109,11 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, for (i = 0; i < NUM_COUNTERS; ++i) { if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); - - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - 
CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -486,14 +475,16 @@ static void op_amd_exit(void) {} #endif /* CONFIG_OPROFILE_IBS */ struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .reserved = MSR_AMD_EVENTSEL_RESERVED, + .event_mask = OP_EVENT_MASK, + .init = op_amd_init, + .exit = op_amd_exit, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown + .shutdown = &op_amd_shutdown, }; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 40b44ee521d..3092f998baf 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -10,6 +10,7 @@ * @author Philippe Elie * @author Graydon Hoare * @author Andi Kleen + * @author Robert Richter */ #include @@ -26,8 +27,8 @@ static int num_counters = 2; static int counter_width = 32; #define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) -#define CTRL_CLEAR(x) (x &= (1<<21)) -#define CTRL_SET_EVENT(val, e) (val |= e) + +#define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) static u64 *reset_value; @@ -54,7 +55,7 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs) static void ppro_setup_ctrs(struct op_x86_model_spec const *model, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) { @@ -85,9 +86,9 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, 
for (i = 0 ; i < num_counters; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); } /* avoid a false detection of ctr overflows in NMI handler */ @@ -101,17 +102,11 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, for (i = 0; i < num_counters; ++i) { if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - - rdmsr(msrs->controls[i].addr, low, high); - CTRL_CLEAR(low); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT(low, counter_config[i].event); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[i]); + wrmsrl(msrs->controls[i].addr, val); } else { reset_value[i] = 0; } @@ -205,6 +200,7 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = 2, .num_controls = 2, + .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -249,6 +245,7 @@ static int arch_perfmon_init(struct oprofile_operations *ignore) } struct op_x86_model_spec op_arch_perfmon_spec = { + .reserved = MSR_PPRO_EVENTSEL_RESERVED, .init = &arch_perfmon_init, /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 6161c7f0e7f..3220d4ce632 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -6,21 +6,19 @@ 
* @remark Read the file COPYING * * @author Graydon Hoare + * @author Robert Richter */ #ifndef OP_X86_MODEL_H #define OP_X86_MODEL_H +#include #include #define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0) #define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0) #define CTRL_SET_ACTIVE(val) ((val) |= ARCH_PERFMON_EVENTSEL0_ENABLE) -#define CTRL_SET_ENABLE(val) ((val) |= ARCH_PERFMON_EVENTSEL_INT) #define CTRL_SET_INACTIVE(val) ((val) &= ~ARCH_PERFMON_EVENTSEL0_ENABLE) -#define CTRL_SET_KERN(val, k) ((val) |= ((k) ? ARCH_PERFMON_EVENTSEL_OS : 0)) -#define CTRL_SET_USR(val, u) ((val) |= ((u) ? ARCH_PERFMON_EVENTSEL_USR : 0)) -#define CTRL_SET_UM(val, m) ((val) |= ((m) << 8)) struct op_saved_msr { unsigned int high; @@ -39,12 +37,16 @@ struct op_msrs { struct pt_regs; +struct oprofile_operations; + /* The model vtable abstracts the differences between * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { unsigned int num_counters; unsigned int num_controls; + u64 reserved; + u16 event_mask; int (*init)(struct oprofile_operations *ops); void (*exit)(void); void (*fill_in_addresses)(struct op_msrs * const msrs); @@ -57,6 +59,11 @@ struct op_x86_model_spec { void (*shutdown)(struct op_msrs const * const msrs); }; +struct op_counter_config; + +extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, + struct op_counter_config *counter_config); + extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; -- cgit v1.2.3-70-g09d2 From 42399adb239d4f1413899cc618ecf640779e79df Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 17:59:06 +0200 Subject: x86/oprofile: replace CTR_OVERFLOWED macros The patch replaces all CTR_OVERFLOWED macros. 64 bit MSR functions and 64 bit counter values are used now. Thus, it will be easier to later extend the models to use more than 32 bit width counters. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 16 ++++++++-------- arch/x86/oprofile/op_model_p4.c | 6 +++--- arch/x86/oprofile/op_model_ppro.c | 10 ++++------ 3 files changed, 15 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 2406ab86360..b5d678fbf03 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -26,11 +26,10 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 #define OP_EVENT_MASK 0x0FFF +#define OP_CTR_OVERFLOW (1ULL<<31) #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - static unsigned long reset_value[NUM_COUNTERS]; #ifdef CONFIG_OPROFILE_IBS @@ -241,17 +240,18 @@ static inline void op_amd_stop_ibs(void) { } static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; - rdmsr(msrs->counters[i].addr, low, high); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); - } + rdmsrl(msrs->counters[i].addr, val); + /* bit is clear if overflowed: */ + if (val & OP_CTR_OVERFLOW) + continue; + oprofile_add_sample(regs, i); + wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); } op_amd_handle_ibs(regs, msrs); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 05ba0287b1f..ac4ca28b9ed 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -32,6 +32,8 @@ #define NUM_CCCRS_HT2 9 #define NUM_CONTROLS_HT2 (NUM_ESCRS_HT2 + NUM_CCCRS_HT2) +#define OP_CTR_OVERFLOW (1ULL<<31) + static unsigned int num_counters = NUM_COUNTERS_NON_HT; static unsigned int num_controls = NUM_CONTROLS_NON_HT; @@ -362,8 +364,6 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { 
#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) - /* this assigns a "stagger" to the current CPU, which is used throughout the code in this module as an extra array offset, to select the "even" @@ -622,7 +622,7 @@ static int p4_check_ctrs(struct pt_regs * const regs, rdmsr(p4_counters[real].cccr_address, low, high); rdmsr(p4_counters[real].counter_address, ctr, high); - if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { + if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); wrmsr(p4_counters[real].counter_address, -(u32)reset_value[i], -1); diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 3092f998baf..82db396dc3e 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -26,8 +26,6 @@ static int num_counters = 2; static int counter_width = 32; -#define CTR_OVERFLOWED(n) (!((n) & (1ULL<<(counter_width-1)))) - #define MSR_PPRO_EVENTSEL_RESERVED ((0xFFFFFFFFULL<<32)|(1ULL<<21)) static u64 *reset_value; @@ -124,10 +122,10 @@ static int ppro_check_ctrs(struct pt_regs * const regs, if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); - if (CTR_OVERFLOWED(val)) { - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -reset_value[i]); - } + if (val & (1ULL << (counter_width - 1))) + continue; + oprofile_add_sample(regs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } /* Only P6 based Pentium M need to re-unmask the apic vector but it -- cgit v1.2.3-70-g09d2 From dea3766ca052a4f572b16a23a322553c064d75af Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 18:11:52 +0200 Subject: x86/oprofile: replace CTRL_SET_*ACTIVE macros The patch replaces all CTRL_SET_*ACTIVE macros. 64 bit MSR functions and 64 bit counter values are used now. The code uses bit masks from <asm/intel_arch_perfmon.h>.
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 16 ++++++++-------- arch/x86/oprofile/op_model_ppro.c | 16 ++++++++-------- arch/x86/oprofile/op_x86_model.h | 2 -- 3 files changed, 16 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b5d678fbf03..4ac9d283e8d 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -262,13 +262,13 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, static void op_amd_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (reset_value[i]) { - rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_ACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } @@ -277,7 +277,7 @@ static void op_amd_start(struct op_msrs const * const msrs) static void op_amd_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; /* @@ -287,9 +287,9 @@ static void op_amd_stop(struct op_msrs const * const msrs) for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_INACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } op_amd_stop_ibs(); diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 82db396dc3e..566b43f0b6c 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -145,16 +145,16 @@ static int ppro_check_ctrs(struct pt_regs * const regs, static void ppro_start(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) return; for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { - 
rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_ACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } } @@ -162,7 +162,7 @@ static void ppro_start(struct op_msrs const * const msrs) static void ppro_stop(struct op_msrs const * const msrs) { - unsigned int low, high; + u64 val; int i; if (!reset_value) @@ -170,9 +170,9 @@ static void ppro_stop(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; - rdmsr(msrs->controls[i].addr, low, high); - CTRL_SET_INACTIVE(low); - wrmsr(msrs->controls[i].addr, low, high); + rdmsrl(msrs->controls[i].addr, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 3220d4ce632..1c4577795a9 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -17,8 +17,6 @@ #define CTR_IS_RESERVED(msrs, c) ((msrs)->counters[(c)].addr ? 1 : 0) #define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0) -#define CTRL_SET_ACTIVE(val) ((val) |= ARCH_PERFMON_EVENTSEL0_ENABLE) -#define CTRL_SET_INACTIVE(val) ((val) &= ~ARCH_PERFMON_EVENTSEL0_ENABLE) struct op_saved_msr { unsigned int high; -- cgit v1.2.3-70-g09d2 From 217d3cfb959756cb493fc03106c0253baa420ce8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 4 Jun 2009 02:36:44 +0200 Subject: x86/oprofile: replace CTR*_IS_RESERVED macros The patch replaces all CTR*_IS_RESERVED macros. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 10 +++++----- arch/x86/oprofile/op_model_p4.c | 10 +++++----- arch/x86/oprofile/op_model_ppro.c | 10 +++++----- arch/x86/oprofile/op_x86_model.h | 3 --- 4 files changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 4ac9d283e8d..c5c5eec2fa7 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -90,7 +90,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; @@ -99,14 +99,14 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; wrmsr(msrs->counters[i].addr, -1, -1); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); rdmsrl(msrs->controls[i].addr, val); @@ -300,11 +300,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } } diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index ac4ca28b9ed..9db0ca9af76 100644 --- 
a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -559,7 +559,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, /* clear the cccrs we will use */ for (i = 0 ; i < num_counters ; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); @@ -569,14 +569,14 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, /* clear all escrs (including those outside our concern) */ for (i = num_counters; i < num_controls; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; wrmsr(msrs->controls[i].addr, 0, 0); } /* setup all counters */ for (i = 0 ; i < num_counters ; ++i) { - if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); wrmsr(p4_counters[VIRT_CTR(stag, i)].counter_address, @@ -679,7 +679,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(msrs->counters[i].addr); } /* @@ -688,7 +688,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) * This saves a few bits. 
*/ for (i = num_counters ; i < num_controls ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(msrs->controls[i].addr); } } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 566b43f0b6c..0a261a5c696 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -82,7 +82,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* clear all counters */ for (i = 0 ; i < num_counters; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; @@ -91,14 +91,14 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, /* avoid a false detection of ctr overflows in NMI handler */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) + if (unlikely(!msrs->counters[i].addr)) continue; wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < num_counters; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; wrmsrl(msrs->counters[i].addr, -reset_value[i]); rdmsrl(msrs->controls[i].addr, val); @@ -181,11 +181,11 @@ static void ppro_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) + if (msrs->counters[i].addr) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } for (i = 0 ; i < num_counters ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) + if (msrs->controls[i].addr) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } if (reset_value) { diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 1c4577795a9..69f1eb46e1b 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -15,9 +15,6 @@ #include #include -#define CTR_IS_RESERVED(msrs, c) 
((msrs)->counters[(c)].addr ? 1 : 0) -#define CTRL_IS_RESERVED(msrs, c) ((msrs)->controls[(c)].addr ? 1 : 0) - struct op_saved_msr { unsigned int high; unsigned int low; -- cgit v1.2.3-70-g09d2 From bbc5986d2db427fdd61b6116ff8b9ed988e663a8 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 25 May 2009 17:38:19 +0200 Subject: x86/oprofile: use 64 bit wrmsr functions This patch replaces some wrmsr() functions with wrmsrl(). Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 ++++--- arch/x86/oprofile/op_model_p4.c | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index c5c5eec2fa7..9bf90176241 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -101,14 +101,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!msrs->counters[i].addr)) continue; - wrmsr(msrs->counters[i].addr, -1, -1); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; - wrmsr(msrs->counters[i].addr, -(unsigned int)counter_config[i].count, -1); + wrmsrl(msrs->counters[i].addr, + -(s64)counter_config[i].count); rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; val |= op_x86_get_ctrl(model, &counter_config[i]); @@ -251,7 +252,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, if (val & OP_CTR_OVERFLOW) continue; oprofile_add_sample(regs, i); - wrmsr(msrs->counters[i].addr, -(unsigned int)reset_value[i], -1); + wrmsrl(msrs->counters[i].addr, -(s64)reset_value[i]); } op_amd_handle_ibs(regs, msrs); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 9db0ca9af76..f01e53b118f 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ 
b/arch/x86/oprofile/op_model_p4.c @@ -579,8 +579,8 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); - wrmsr(p4_counters[VIRT_CTR(stag, i)].counter_address, - -(u32)counter_config[i].count, -1); + wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, + -(s64)counter_config[i].count); } else { reset_value[i] = 0; } @@ -624,12 +624,12 @@ static int p4_check_ctrs(struct pt_regs * const regs, rdmsr(p4_counters[real].counter_address, ctr, high); if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); - wrmsr(p4_counters[real].counter_address, - -(u32)reset_value[i], -1); + wrmsrl(p4_counters[real].counter_address, + -(s64)reset_value[i]); CCCR_CLEAR_OVF(low); wrmsr(p4_counters[real].cccr_address, low, high); - wrmsr(p4_counters[real].counter_address, - -(u32)reset_value[i], -1); + wrmsrl(p4_counters[real].counter_address, + -(s64)reset_value[i]); } } -- cgit v1.2.3-70-g09d2 From 95e74e62c1540b1115fe8cec5b592f22960f2bb2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 3 Jun 2009 19:09:27 +0200 Subject: x86/oprofile: use 64 bit values to save MSR states This patch removes struct op_saved_msr and replaces it by an u64 variable. This makes code easier and it is possible to use 64 bit MSR functions. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 28 ++++++++-------------------- arch/x86/oprofile/op_x86_model.h | 9 ++------- 2 files changed, 10 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 388ee15e0e4..3b84b789de0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -78,19 +78,13 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) unsigned int i; for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - rdmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + if (counters[i].addr) + rdmsrl(counters[i].addr, counters[i].saved); } for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - rdmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + if (controls[i].addr) + rdmsrl(controls[i].addr, controls[i].saved); } } @@ -204,19 +198,13 @@ static void nmi_restore_registers(struct op_msrs *msrs) unsigned int i; for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr) { - wrmsr(controls[i].addr, - controls[i].saved.low, - controls[i].saved.high); - } + if (controls[i].addr) + wrmsrl(controls[i].addr, controls[i].saved); } for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr) { - wrmsr(counters[i].addr, - counters[i].saved.low, - counters[i].saved.high); - } + if (counters[i].addr) + wrmsrl(counters[i].addr, counters[i].saved); } } diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 69f1eb46e1b..fda52b4c1b9 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -15,14 +15,9 @@ #include #include -struct op_saved_msr { - unsigned int high; - unsigned int low; -}; - struct op_msr { - unsigned long addr; - struct op_saved_msr saved; + unsigned long addr; + u64 saved; }; struct op_msrs { -- cgit v1.2.3-70-g09d2 From 1a245c45343651a87ff63afc5ddeb8e24d731835 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Jun 
2009 15:54:24 +0200 Subject: x86/oprofile: remove some local variables in MSR save/restore functions The patch removes some local variables in these functions. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3b84b789de0..80b63d5db50 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -71,18 +71,16 @@ static int profile_exceptions_notify(struct notifier_block *self, static void nmi_cpu_save_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrs; ++i) { + for (i = 0; i < model->num_counters; ++i) { if (counters[i].addr) rdmsrl(counters[i].addr, counters[i].saved); } - for (i = 0; i < nr_ctrls; ++i) { + for (i = 0; i < model->num_controls; ++i) { if (controls[i].addr) rdmsrl(controls[i].addr, controls[i].saved); } @@ -191,18 +189,16 @@ static int nmi_setup(void) static void nmi_restore_registers(struct op_msrs *msrs) { - unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; unsigned int i; - for (i = 0; i < nr_ctrls; ++i) { + for (i = 0; i < model->num_controls; ++i) { if (controls[i].addr) wrmsrl(controls[i].addr, controls[i].saved); } - for (i = 0; i < nr_ctrs; ++i) { + for (i = 0; i < model->num_counters; ++i) { if (counters[i].addr) wrmsrl(counters[i].addr, counters[i].saved); } -- cgit v1.2.3-70-g09d2 From c572ae4efd1b0a5cc76c5e6aae05c1b182b6a69c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 3 Jun 2009 20:10:39 +0200 Subject: x86/oprofile: use 64 bit values in IBS functions The IBS code internally uses 32 bit 
values (a low and a high value) to represent a 64 bit value. This patch changes this and now 64 bit values are used instead. 64 bit MSR functions can be used now. No functional changes. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 131 ++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 70 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 9bf90176241..6493ef7ae9a 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -35,16 +35,18 @@ static unsigned long reset_value[NUM_COUNTERS]; #ifdef CONFIG_OPROFILE_IBS /* IbsFetchCtl bits/masks */ -#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ -#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ -#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL /*IbsOpCtl bits */ -#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ -#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) -#define IBS_FETCH_SIZE 6 -#define IBS_OP_SIZE 12 +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 static int has_ibs; /* AMD Family10h and later */ @@ -126,66 +128,63 @@ static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - u32 low, high; - u64 msr; + u64 val, ctl; struct op_entry entry; if (!has_ibs) return 1; if (ibs_config.fetch_enabled) { - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_HIGH_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); + if (ctl & IBS_FETCH_VAL) { + rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); + oprofile_write_reserve(&entry, regs, val, IBS_FETCH_CODE, IBS_FETCH_SIZE); 
- oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - oprofile_add_data(&entry, low); - oprofile_add_data(&entry, high); - rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data(&entry, (u32)ctl); + oprofile_add_data(&entry, (u32)(ctl >> 32)); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); oprofile_write_commit(&entry); /* reenable the IRQ */ - high &= ~IBS_FETCH_HIGH_VALID_BIT; - high |= IBS_FETCH_HIGH_ENABLE; - low &= IBS_FETCH_LOW_MAX_CNT_MASK; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } } if (ibs_config.op_enabled) { - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_LOW_VALID_BIT) { - rdmsrl(MSR_AMD64_IBSOPRIP, msr); - oprofile_write_reserve(&entry, regs, msr, + rdmsrl(MSR_AMD64_IBSOPCTL, ctl); + if (ctl & IBS_OP_VAL) { + rdmsrl(MSR_AMD64_IBSOPRIP, val); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, IBS_OP_SIZE); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA2, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSOPDATA3, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCLINAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); - rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); - oprofile_add_data(&entry, (u32)msr); - oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, (u32)val); + 
oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA2, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA3, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSDCLINAD, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); + oprofile_add_data(&entry, (u32)val); + oprofile_add_data(&entry, (u32)(val >> 32)); oprofile_write_commit(&entry); /* reenable the IRQ */ - high = 0; - low &= ~IBS_OP_LOW_VALID_BIT; - low |= IBS_OP_LOW_ENABLE; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); + ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; + ctl |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -194,39 +193,31 @@ op_amd_handle_ibs(struct pt_regs * const regs, static inline void op_amd_start_ibs(void) { - unsigned int low, high; + u64 val; if (has_ibs && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ - + IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; + val |= IBS_FETCH_ENABLE; + wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } if (has_ibs && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) - + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ - + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); + val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; + val |= ibs_config.dispatched_ops ? 
IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + wrmsrl(MSR_AMD64_IBSOPCTL, val); } } static void op_amd_stop_ibs(void) { - unsigned int low, high; - if (has_ibs && ibs_config.fetch_enabled) { + if (has_ibs && ibs_config.fetch_enabled) /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - if (has_ibs && ibs_config.op_enabled) { + if (has_ibs && ibs_config.op_enabled) /* clear max count and enable */ - low = 0; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } + wrmsrl(MSR_AMD64_IBSOPCTL, 0); } #else -- cgit v1.2.3-70-g09d2 From 51563a0e5650d0d76539625388d72d62b34c726e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 3 Jun 2009 20:54:56 +0200 Subject: x86/oprofile: introduce oprofile_add_data64() The IBS implementation writes 64 bit register values to the cpu buffer by writing two 32 bit values using oprofile_add_data(). This patch introduces oprofile_add_data64() to write a single 64 bit value to the buffer.
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 27 +++++++++------------------ drivers/oprofile/cpu_buffer.c | 15 +++++++++++++++ include/linux/oprofile.h | 1 + 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 6493ef7ae9a..cc930467575 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -140,13 +140,10 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsrl(MSR_AMD64_IBSFETCHLINAD, val); oprofile_write_reserve(&entry, regs, val, IBS_FETCH_CODE, IBS_FETCH_SIZE); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); - oprofile_add_data(&entry, (u32)ctl); - oprofile_add_data(&entry, (u32)(ctl >> 32)); + oprofile_add_data64(&entry, val); + oprofile_add_data64(&entry, ctl); rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ @@ -162,23 +159,17 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsrl(MSR_AMD64_IBSOPRIP, val); oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, IBS_OP_SIZE); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA2, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA3, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSDCLINAD, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + 
oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); - oprofile_add_data(&entry, (u32)val); - oprofile_add_data(&entry, (u32)(val >> 32)); + oprofile_add_data64(&entry, val); oprofile_write_commit(&entry); /* reenable the IRQ */ diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 50640cc5eef..a7aae24f288 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -406,6 +406,21 @@ int oprofile_add_data(struct op_entry *entry, unsigned long val) return op_cpu_buffer_add_data(entry, val); } +int oprofile_add_data64(struct op_entry *entry, u64 val) +{ + if (!entry->event) + return 0; + if (op_cpu_buffer_get_size(entry) < 2) + /* + * the function returns 0 to indicate a too small + * buffer, even if there is some space left + */ + return 0; + if (!op_cpu_buffer_add_data(entry, (u32)val)) + return 0; + return op_cpu_buffer_add_data(entry, (u32)(val >> 32)); +} + int oprofile_write_commit(struct op_entry *entry) { if (!entry->event) diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index dbbe2dbc441..d68d2ed94f1 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -179,6 +179,7 @@ void oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs, unsigned long pc, int code, int size); int oprofile_add_data(struct op_entry *entry, unsigned long val); +int oprofile_add_data64(struct op_entry *entry, u64 val); int oprofile_write_commit(struct op_entry *entry); #endif /* OPROFILE_H */ -- cgit v1.2.3-70-g09d2 From 802070f5474af1a49435a9528aede47bb18abd47 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 12 Jun 2009 18:32:07 +0200 Subject: x86/oprofile: fix initialization of arch_perfmon for core_i7 Commit: e419294 x86/oprofile: moving arch_perfmon counter setup to op_x86_model_spec.init introduced a bug in the initialization of core_i7 leading to the incorrect model setup to &op_ppro_spec. This patch fixes this. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7826dfcc842..28ee490c1b8 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -406,6 +406,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + struct op_x86_model_spec const *spec = &op_ppro_spec; /* default */ if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; @@ -432,7 +433,7 @@ static int __init ppro_init(char **cpu_type) *cpu_type = "i386/core_2"; break; case 26: - model = &op_arch_perfmon_spec; + spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; case 28: @@ -443,7 +444,7 @@ static int __init ppro_init(char **cpu_type) return 0; } - model = &op_ppro_spec; + model = spec; return 1; } -- cgit v1.2.3-70-g09d2 From c64b04fe6e0cb7c78e01751a44ef56cf20344e87 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 14 Jun 2009 00:59:50 +0530 Subject: x86, cpu: cpu/proc.c display cache alignment and address sizes for 32 bit 32 bits can also access x86_cache_alignment, x86_phys_bits and x86_virt_bits, make them available to user space just as on 64 bits. Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1244921390.11733.30.camel@ht.satnam> Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/proc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246..f82706a3901 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -116,11 +116,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); #endif seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); -#ifdef CONFIG_X86_64 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); -#endif seq_printf(m, "power management:"); for (i = 0; i < 32; i++) { -- cgit v1.2.3-70-g09d2 From ac5672f82c39ff2f8dce81bf3e68b1dfc41f366f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Apr 2009 14:29:44 -0700 Subject: x86/paravirt: split paravirt definitions into paravirt_types.h Split the monolithic asm/paravirt.h into separate paravirt.h (inlines and other "active" definitions), and paravirt_types.h (types, constants and other "passive" definitions). This makes it easier to use the type/constant definitions without pulling in everything else and causing circular dependency problems. [ Impact: cleanup ] Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/paravirt.h | 711 +-------------------------------- arch/x86/include/asm/paravirt_types.h | 720 ++++++++++++++++++++++++++++++++++ 2 files changed, 721 insertions(+), 710 deletions(-) create mode 100644 arch/x86/include/asm/paravirt_types.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8a083..6a07af432c8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -7,689 +7,11 @@ #include #include -/* Bitmask of what can be clobbered: usually at least eax. 
*/ -#define CLBR_NONE 0 -#define CLBR_EAX (1 << 0) -#define CLBR_ECX (1 << 1) -#define CLBR_EDX (1 << 2) -#define CLBR_EDI (1 << 3) - -#ifdef CONFIG_X86_32 -/* CLBR_ANY should match all regs platform has. For i386, that's just it */ -#define CLBR_ANY ((1 << 4) - 1) - -#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) -#define CLBR_SCRATCH (0) -#else -#define CLBR_RAX CLBR_EAX -#define CLBR_RCX CLBR_ECX -#define CLBR_RDX CLBR_EDX -#define CLBR_RDI CLBR_EDI -#define CLBR_RSI (1 << 4) -#define CLBR_R8 (1 << 5) -#define CLBR_R9 (1 << 6) -#define CLBR_R10 (1 << 7) -#define CLBR_R11 (1 << 8) - -#define CLBR_ANY ((1 << 9) - 1) - -#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ - CLBR_RCX | CLBR_R8 | CLBR_R9) -#define CLBR_RET_REG (CLBR_RAX) -#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) - -#include -#endif /* X86_64 */ - -#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) +#include #ifndef __ASSEMBLY__ #include #include -#include -#include - -struct page; -struct thread_struct; -struct desc_ptr; -struct tss_struct; -struct mm_struct; -struct desc_struct; -struct task_struct; - -/* - * Wrapper type for pointers to code which uses the non-standard - * calling convention. See PV_CALL_SAVE_REGS_THUNK below. - */ -struct paravirt_callee_save { - void *func; -}; - -/* general info */ -struct pv_info { - unsigned int kernel_rpl; - int shared_kernel_pmd; - int paravirt_enabled; - const char *name; -}; - -struct pv_init_ops { - /* - * Patch may replace one of the defined code sequences with - * arbitrary code, subject to the same register constraints. - * This generally means the code is not free to clobber any - * registers other than EAX. The patch function should return - * the number of bytes of code generated, as we nop pad the - * rest in generic code. 
- */ - unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, - unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*arch_setup)(void); - char *(*memory_setup)(void); - void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); -}; - - -struct pv_lazy_ops { - /* Set deferred update mode, used for batching operations. */ - void (*enter)(void); - void (*leave)(void); -}; - -struct pv_time_ops { - void (*time_init)(void); - - /* Set and set time of day */ - unsigned long (*get_wallclock)(void); - int (*set_wallclock)(unsigned long); - - unsigned long long (*sched_clock)(void); - unsigned long (*get_tsc_khz)(void); -}; - -struct pv_cpu_ops { - /* hooks for various privileged instructions */ - unsigned long (*get_debugreg)(int regno); - void (*set_debugreg)(int regno, unsigned long value); - - void (*clts)(void); - - unsigned long (*read_cr0)(void); - void (*write_cr0)(unsigned long); - - unsigned long (*read_cr4_safe)(void); - unsigned long (*read_cr4)(void); - void (*write_cr4)(unsigned long); - -#ifdef CONFIG_X86_64 - unsigned long (*read_cr8)(void); - void (*write_cr8)(unsigned long); -#endif - - /* Segment descriptor handling */ - void (*load_tr_desc)(void); - void (*load_gdt)(const struct desc_ptr *); - void (*load_idt)(const struct desc_ptr *); - void (*store_gdt)(struct desc_ptr *); - void (*store_idt)(struct desc_ptr *); - void (*set_ldt)(const void *desc, unsigned entries); - unsigned long (*store_tr)(void); - void (*load_tls)(struct thread_struct *t, unsigned int cpu); -#ifdef CONFIG_X86_64 - void (*load_gs_index)(unsigned int idx); -#endif - void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, - const void *desc); - void (*write_gdt_entry)(struct desc_struct *, - int entrynum, const void *desc, int size); - void (*write_idt_entry)(gate_desc *, - int entrynum, const gate_desc *gate); - void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); - void 
(*free_ldt)(struct desc_struct *ldt, unsigned entries); - - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); - - void (*set_iopl_mask)(unsigned mask); - - void (*wbinvd)(void); - void (*io_delay)(void); - - /* cpuid emulation, mostly so that caps bits can be disabled */ - void (*cpuid)(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx); - - /* MSR, PMC and TSR operations. - err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr_amd)(unsigned int msr, int *err); - u64 (*read_msr)(unsigned int msr, int *err); - int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - - u64 (*read_tsc)(void); - u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); - - /* - * Atomically enable interrupts and return to userspace. This - * is only ever used to return to 32-bit processes; in a - * 64-bit kernel, it's used for 32-on-64 compat processes, but - * never native 64-bit processes. (Jump, not call.) - */ - void (*irq_enable_sysexit)(void); - - /* - * Switch to usermode gs and return to 64-bit usermode using - * sysret. Only used in 64-bit kernels to return to 64-bit - * processes. Usermode register state, including %rsp, must - * already be restored. - */ - void (*usergs_sysret64)(void); - - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - - /* Normal iret. Jump to this with the standard iret stack - frame set up. */ - void (*iret)(void); - - void (*swapgs)(void); - - void (*start_context_switch)(struct task_struct *prev); - void (*end_context_switch)(struct task_struct *next); -}; - -struct pv_irq_ops { - void (*init_IRQ)(void); - - /* - * Get/set interrupt state. 
save_fl and restore_fl are only - * expected to use X86_EFLAGS_IF; all other bits - * returned from save_fl are undefined, and may be ignored by - * restore_fl. - * - * NOTE: These functions callers expect the callee to preserve - * more registers than the standard C calling convention. - */ - struct paravirt_callee_save save_fl; - struct paravirt_callee_save restore_fl; - struct paravirt_callee_save irq_disable; - struct paravirt_callee_save irq_enable; - - void (*safe_halt)(void); - void (*halt)(void); - -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif -}; - -struct pv_apic_ops { -#ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - - void (*startup_ipi_hook)(int phys_apicid, - unsigned long start_eip, - unsigned long start_esp); -#endif -}; - -struct pv_mmu_ops { - /* - * Called before/after init_mm pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - - unsigned long (*read_cr2)(void); - void (*write_cr2)(unsigned long); - - unsigned long (*read_cr3)(void); - void (*write_cr3)(unsigned long); - - /* - * Hooks for intercepting the creation/use/destruction of an - * mm_struct. 
- */ - void (*activate_mm)(struct mm_struct *prev, - struct mm_struct *next); - void (*dup_mmap)(struct mm_struct *oldmm, - struct mm_struct *mm); - void (*exit_mmap)(struct mm_struct *mm); - - - /* TLB operations */ - void (*flush_tlb_user)(void); - void (*flush_tlb_kernel)(void); - void (*flush_tlb_single)(unsigned long addr); - void (*flush_tlb_others)(const struct cpumask *cpus, - struct mm_struct *mm, - unsigned long va); - - /* Hooks for allocating and freeing a pagetable top-level */ - int (*pgd_alloc)(struct mm_struct *mm); - void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); - - /* - * Hooks for allocating/releasing pagetable pages when they're - * attached to a pagetable - */ - void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); - void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); - void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); - void (*release_pte)(unsigned long pfn); - void (*release_pmd)(unsigned long pfn); - void (*release_pud)(unsigned long pfn); - - /* Pagetable manipulation functions */ - void (*set_pte)(pte_t *ptep, pte_t pteval); - void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - - pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); - - struct paravirt_callee_save pte_val; - struct paravirt_callee_save make_pte; - - struct paravirt_callee_save pgd_val; - struct paravirt_callee_save make_pgd; - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE - void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); - void 
(*pte_clear)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); - void (*pmd_clear)(pmd_t *pmdp); - -#endif /* CONFIG_X86_PAE */ - - void (*set_pud)(pud_t *pudp, pud_t pudval); - - struct paravirt_callee_save pmd_val; - struct paravirt_callee_save make_pmd; - -#if PAGETABLE_LEVELS == 4 - struct paravirt_callee_save pud_val; - struct paravirt_callee_save make_pud; - - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* PAGETABLE_LEVELS == 4 */ -#endif /* PAGETABLE_LEVELS >= 3 */ - -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - - struct pv_lazy_ops lazy_mode; - - /* dom0 ops */ - - /* Sometimes the physical address is a pfn, and sometimes its - an mfn. We can tell which is which from the index. */ - void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, - phys_addr_t phys, pgprot_t flags); -}; - -struct raw_spinlock; -struct pv_lock_ops { - int (*spin_is_locked)(struct raw_spinlock *lock); - int (*spin_is_contended)(struct raw_spinlock *lock); - void (*spin_lock)(struct raw_spinlock *lock); - void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct raw_spinlock *lock); - void (*spin_unlock)(struct raw_spinlock *lock); -}; - -/* This contains all the paravirt structures: we get a convenient - * number for each function using the offset which we use to indicate - * what to patch. 
*/ -struct paravirt_patch_template { - struct pv_init_ops pv_init_ops; - struct pv_time_ops pv_time_ops; - struct pv_cpu_ops pv_cpu_ops; - struct pv_irq_ops pv_irq_ops; - struct pv_apic_ops pv_apic_ops; - struct pv_mmu_ops pv_mmu_ops; - struct pv_lock_ops pv_lock_ops; -}; - -extern struct pv_info pv_info; -extern struct pv_init_ops pv_init_ops; -extern struct pv_time_ops pv_time_ops; -extern struct pv_cpu_ops pv_cpu_ops; -extern struct pv_irq_ops pv_irq_ops; -extern struct pv_apic_ops pv_apic_ops; -extern struct pv_mmu_ops pv_mmu_ops; -extern struct pv_lock_ops pv_lock_ops; - -#define PARAVIRT_PATCH(x) \ - (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) - -#define paravirt_type(op) \ - [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ - [paravirt_opptr] "i" (&(op)) -#define paravirt_clobber(clobber) \ - [paravirt_clobber] "i" (clobber) - -/* - * Generate some code, and mark it as patchable by the - * apply_paravirt() alternate instruction patcher. - */ -#define _paravirt_alt(insn_string, type, clobber) \ - "771:\n\t" insn_string "\n" "772:\n" \ - ".pushsection .parainstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR " 771b\n" \ - " .byte " type "\n" \ - " .byte 772b-771b\n" \ - " .short " clobber "\n" \ - ".popsection\n" - -/* Generate patchable code, with the default asm parameters. */ -#define paravirt_alt(insn_string) \ - _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") - -/* Simple instruction patching code. 
*/ -#define DEF_NATIVE(ops, name, code) \ - extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") - -unsigned paravirt_patch_nop(void); -unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_ignore(unsigned len); -unsigned paravirt_patch_call(void *insnbuf, - const void *target, u16 tgt_clobbers, - unsigned long addr, u16 site_clobbers, - unsigned len); -unsigned paravirt_patch_jmp(void *insnbuf, const void *target, - unsigned long addr, unsigned len); -unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len); - -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, - const char *start, const char *end); - -unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len); - -int paravirt_disable_iospace(void); - -/* - * This generates an indirect call based on the operation type number. - * The type number, computed in PARAVIRT_PATCH, is derived from the - * offset into the paravirt_patch_template structure, and can therefore be - * freely converted back into a structure offset. - */ -#define PARAVIRT_CALL "call *%c[paravirt_opptr];" - -/* - * These macros are intended to wrap calls through one of the paravirt - * ops structs, so that they can be later identified and patched at - * runtime. - * - * Normally, a call to a pv_op function is a simple indirect call: - * (pv_op_struct.operations)(args...). - * - * Unfortunately, this is a relatively slow operation for modern CPUs, - * because it cannot necessarily determine what the destination - * address is. In this case, the address is a runtime constant, so at - * the very least we can patch the call to e a simple direct call, or - * ideally, patch an inline implementation into the callsite. 
(Direct - * calls are essentially free, because the call and return addresses - * are completely predictable.) - * - * For i386, these macros rely on the standard gcc "regparm(3)" calling - * convention, in which the first three arguments are placed in %eax, - * %edx, %ecx (in that order), and the remaining arguments are placed - * on the stack. All caller-save registers (eax,edx,ecx) are expected - * to be modified (either clobbered or used for return values). - * X86_64, on the other hand, already specifies a register-based calling - * conventions, returning at %rax, with parameteres going on %rdi, %rsi, - * %rdx, and %rcx. Note that for this reason, x86_64 does not need any - * special handling for dealing with 4 arguments, unlike i386. - * However, x86_64 also have to clobber all caller saved registers, which - * unfortunately, are quite a bit (r8 - r11) - * - * The call instruction itself is marked by placing its start address - * and size into the .parainstructions section, so that - * apply_paravirt() in arch/i386/kernel/alternative.c can do the - * appropriate patching under the control of the backend pv_init_ops - * implementation. - * - * Unfortunately there's no way to get gcc to generate the args setup - * for the call, and then allow the call itself to be generated by an - * inline asm. Because of this, we must do the complete arg setup and - * return value handling from within these macros. This is fairly - * cumbersome. - * - * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. - * It could be extended to more arguments, but there would be little - * to be gained from that. For each number of arguments, there are - * the two VCALL and CALL variants for void and non-void functions. - * - * When there is a return value, the invoker of the macro must specify - * the return type. 
The macro then uses sizeof() on that type to - * determine whether its a 32 or 64 bit value, and places the return - * in the right register(s) (just %eax for 32-bit, and %edx:%eax for - * 64-bit). For x86_64 machines, it just returns at %rax regardless of - * the return value size. - * - * 64-bit arguments are passed as a pair of adjacent 32-bit arguments - * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments - * in low,high order - * - * Small structures are passed and returned in registers. The macro - * calling convention can't directly deal with this, so the wrapper - * functions must do this. - * - * These PVOP_* macros are only defined within this header. This - * means that all uses must be wrapped in inline functions. This also - * makes sure the incoming and outgoing types are always correct. - */ -#ifdef CONFIG_X86_32 -#define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS - -#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS - -#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS -#define VEXTRA_CLOBBERS -#else /* CONFIG_X86_64 */ -#define PVOP_VCALL_ARGS \ - unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx -#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax - -#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) -#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) -#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) -#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) - -#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ - "=S" (__esi), "=d" (__edx), \ - "=c" (__ecx) -#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) - -#define 
PVOP_VCALLEE_CLOBBERS "=a" (__eax) -#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS - -#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" -#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_PARAVIRT_DEBUG -#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) -#else -#define PVOP_TEST_NULL(op) ((void)op) -#endif - -#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ - pre, post, ...) \ - ({ \ - rettype __ret; \ - PVOP_CALL_ARGS; \ - PVOP_TEST_NULL(op); \ - /* This is 32-bit specific, but is okay in 64-bit */ \ - /* since this condition will never hold */ \ - if (sizeof(rettype) > sizeof(unsigned long)) { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)((((u64)__edx) << 32) | __eax); \ - } else { \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ - } \ - __ret; \ - }) - -#define __PVOP_CALL(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ - EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) - -#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_CALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - -#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ - ({ \ - PVOP_VCALL_ARGS; \ - PVOP_TEST_NULL(op); \ - asm volatile(pre \ - paravirt_alt(PARAVIRT_CALL) \ - post \ - : call_clbr \ - : paravirt_type(op), \ - paravirt_clobber(clbr), \ - ##__VA_ARGS__ \ - : "memory", "cc" extra_clbr); \ - }) - -#define __PVOP_VCALL(op, pre, post, ...) 
\ - ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ - VEXTRA_CLOBBERS, \ - pre, post, ##__VA_ARGS__) - -#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ - ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ - PVOP_VCALLEE_CLOBBERS, , \ - pre, post, ##__VA_ARGS__) - - - -#define PVOP_CALL0(rettype, op) \ - __PVOP_CALL(rettype, op, "", "") -#define PVOP_VCALL0(op) \ - __PVOP_VCALL(op, "", "") - -#define PVOP_CALLEE0(rettype, op) \ - __PVOP_CALLEESAVE(rettype, op, "", "") -#define PVOP_VCALLEE0(op) \ - __PVOP_VCALLEESAVE(op, "", "") - - -#define PVOP_CALL1(rettype, op, arg1) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALL1(op, arg1) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) - -#define PVOP_CALLEE1(rettype, op, arg1) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) -#define PVOP_VCALLEE1(op, arg1) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) - - -#define PVOP_CALL2(rettype, op, arg1, arg2) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALL2(op, arg1, arg2) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - -#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ - __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) -#define PVOP_VCALLEE2(op, arg1, arg2) \ - __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2)) - - -#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ - __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) -#define PVOP_VCALL3(op, arg1, arg2, arg3) \ - __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ - PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) - -/* This is the only difference in x86_64. 
We can make it much simpler */ -#ifdef CONFIG_X86_32 -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, \ - "push %[_arg4];", "lea 4(%%esp),%%esp;", \ - "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ - "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) -#else -#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ - __PVOP_CALL(rettype, op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ - __PVOP_VCALL(op, "", "", \ - PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ - PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) -#endif static inline int paravirt_enabled(void) { @@ -1393,20 +715,6 @@ static inline void pmd_clear(pmd_t *pmdp) } #endif /* CONFIG_X86_PAE */ -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_start_context_switch(struct task_struct *prev); -void paravirt_end_context_switch(struct task_struct *next); - -void paravirt_enter_lazy_mmu(void); -void paravirt_leave_lazy_mmu(void); - #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { @@ -1437,12 +745,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, pv_mmu_ops.set_fixmap(idx, phys, flags); } -void _paravirt_nop(void); -u32 _paravirt_ident_32(u32); -u64 _paravirt_ident_64(u64); - -#define paravirt_nop ((void *)_paravirt_nop) - #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static inline int __raw_spin_is_locked(struct raw_spinlock *lock) @@ -1479,17 +781,6 @@ static __always_inline void 
__raw_spin_unlock(struct raw_spinlock *lock) #endif -/* These all sit in the .parainstructions section to tell us what to patch. */ -struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 instrtype; /* type of this instruction */ - u8 len; /* length of original instruction */ - u16 clobbers; /* what registers you may clobber */ -}; - -extern struct paravirt_patch_site __parainstructions[], - __parainstructions_end[]; - #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" #define PV_RESTORE_REGS "popl %edx; popl %ecx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h new file mode 100644 index 00000000000..2b3371bae29 --- /dev/null +++ b/arch/x86/include/asm/paravirt_types.h @@ -0,0 +1,720 @@ +#ifndef _ASM_X86_PARAVIRT_TYPES_H +#define _ASM_X86_PARAVIRT_TYPES_H + +/* Bitmask of what can be clobbered: usually at least eax. */ +#define CLBR_NONE 0 +#define CLBR_EAX (1 << 0) +#define CLBR_ECX (1 << 1) +#define CLBR_EDX (1 << 2) +#define CLBR_EDI (1 << 3) + +#ifdef CONFIG_X86_32 +/* CLBR_ANY should match all regs platform has. 
For i386, that's just it */ +#define CLBR_ANY ((1 << 4) - 1) + +#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) +#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) +#define CLBR_SCRATCH (0) +#else +#define CLBR_RAX CLBR_EAX +#define CLBR_RCX CLBR_ECX +#define CLBR_RDX CLBR_EDX +#define CLBR_RDI CLBR_EDI +#define CLBR_RSI (1 << 4) +#define CLBR_R8 (1 << 5) +#define CLBR_R9 (1 << 6) +#define CLBR_R10 (1 << 7) +#define CLBR_R11 (1 << 8) + +#define CLBR_ANY ((1 << 9) - 1) + +#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \ + CLBR_RCX | CLBR_R8 | CLBR_R9) +#define CLBR_RET_REG (CLBR_RAX) +#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11) + +#endif /* X86_64 */ + +#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG) + +#ifndef __ASSEMBLY__ + +#include +#include + +struct page; +struct thread_struct; +struct desc_ptr; +struct tss_struct; +struct mm_struct; +struct desc_struct; +struct task_struct; +struct cpumask; + +/* + * Wrapper type for pointers to code which uses the non-standard + * calling convention. See PV_CALL_SAVE_REGS_THUNK below. + */ +struct paravirt_callee_save { + void *func; +}; + +/* general info */ +struct pv_info { + unsigned int kernel_rpl; + int shared_kernel_pmd; + int paravirt_enabled; + const char *name; +}; + +struct pv_init_ops { + /* + * Patch may replace one of the defined code sequences with + * arbitrary code, subject to the same register constraints. + * This generally means the code is not free to clobber any + * registers other than EAX. The patch function should return + * the number of bytes of code generated, as we nop pad the + * rest in generic code. 
+ */ + unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, + unsigned long addr, unsigned len); + + /* Basic arch-specific setup */ + void (*arch_setup)(void); + char *(*memory_setup)(void); + void (*post_allocator_init)(void); + + /* Print a banner to identify the environment */ + void (*banner)(void); +}; + + +struct pv_lazy_ops { + /* Set deferred update mode, used for batching operations. */ + void (*enter)(void); + void (*leave)(void); +}; + +struct pv_time_ops { + void (*time_init)(void); + + /* Set and set time of day */ + unsigned long (*get_wallclock)(void); + int (*set_wallclock)(unsigned long); + + unsigned long long (*sched_clock)(void); + unsigned long (*get_tsc_khz)(void); +}; + +struct pv_cpu_ops { + /* hooks for various privileged instructions */ + unsigned long (*get_debugreg)(int regno); + void (*set_debugreg)(int regno, unsigned long value); + + void (*clts)(void); + + unsigned long (*read_cr0)(void); + void (*write_cr0)(unsigned long); + + unsigned long (*read_cr4_safe)(void); + unsigned long (*read_cr4)(void); + void (*write_cr4)(unsigned long); + +#ifdef CONFIG_X86_64 + unsigned long (*read_cr8)(void); + void (*write_cr8)(unsigned long); +#endif + + /* Segment descriptor handling */ + void (*load_tr_desc)(void); + void (*load_gdt)(const struct desc_ptr *); + void (*load_idt)(const struct desc_ptr *); + void (*store_gdt)(struct desc_ptr *); + void (*store_idt)(struct desc_ptr *); + void (*set_ldt)(const void *desc, unsigned entries); + unsigned long (*store_tr)(void); + void (*load_tls)(struct thread_struct *t, unsigned int cpu); +#ifdef CONFIG_X86_64 + void (*load_gs_index)(unsigned int idx); +#endif + void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum, + const void *desc); + void (*write_gdt_entry)(struct desc_struct *, + int entrynum, const void *desc, int size); + void (*write_idt_entry)(gate_desc *, + int entrynum, const gate_desc *gate); + void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); + void 
(*free_ldt)(struct desc_struct *ldt, unsigned entries); + + void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + + void (*set_iopl_mask)(unsigned mask); + + void (*wbinvd)(void); + void (*io_delay)(void); + + /* cpuid emulation, mostly so that caps bits can be disabled */ + void (*cpuid)(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + + /* MSR, PMC and TSR operations. + err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ + u64 (*read_msr_amd)(unsigned int msr, int *err); + u64 (*read_msr)(unsigned int msr, int *err); + int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + + u64 (*read_tsc)(void); + u64 (*read_pmc)(int counter); + unsigned long long (*read_tscp)(unsigned int *aux); + + /* + * Atomically enable interrupts and return to userspace. This + * is only ever used to return to 32-bit processes; in a + * 64-bit kernel, it's used for 32-on-64 compat processes, but + * never native 64-bit processes. (Jump, not call.) + */ + void (*irq_enable_sysexit)(void); + + /* + * Switch to usermode gs and return to 64-bit usermode using + * sysret. Only used in 64-bit kernels to return to 64-bit + * processes. Usermode register state, including %rsp, must + * already be restored. + */ + void (*usergs_sysret64)(void); + + /* + * Switch to usermode gs and return to 32-bit usermode using + * sysret. Used to return to 32-on-64 compat processes. + * Other usermode register state, including %esp, must already + * be restored. + */ + void (*usergs_sysret32)(void); + + /* Normal iret. Jump to this with the standard iret stack + frame set up. */ + void (*iret)(void); + + void (*swapgs)(void); + + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); +}; + +struct pv_irq_ops { + void (*init_IRQ)(void); + + /* + * Get/set interrupt state. 
save_fl and restore_fl are only + * expected to use X86_EFLAGS_IF; all other bits + * returned from save_fl are undefined, and may be ignored by + * restore_fl. + * + * NOTE: These functions callers expect the callee to preserve + * more registers than the standard C calling convention. + */ + struct paravirt_callee_save save_fl; + struct paravirt_callee_save restore_fl; + struct paravirt_callee_save irq_disable; + struct paravirt_callee_save irq_enable; + + void (*safe_halt)(void); + void (*halt)(void); + +#ifdef CONFIG_X86_64 + void (*adjust_exception_frame)(void); +#endif +}; + +struct pv_apic_ops { +#ifdef CONFIG_X86_LOCAL_APIC + void (*setup_boot_clock)(void); + void (*setup_secondary_clock)(void); + + void (*startup_ipi_hook)(int phys_apicid, + unsigned long start_eip, + unsigned long start_esp); +#endif +}; + +struct pv_mmu_ops { + /* + * Called before/after init_mm pagetable setup. setup_start + * may reset %cr3, and may pre-install parts of the pagetable; + * pagetable setup is expected to preserve any existing + * mapping. + */ + void (*pagetable_setup_start)(pgd_t *pgd_base); + void (*pagetable_setup_done)(pgd_t *pgd_base); + + unsigned long (*read_cr2)(void); + void (*write_cr2)(unsigned long); + + unsigned long (*read_cr3)(void); + void (*write_cr3)(unsigned long); + + /* + * Hooks for intercepting the creation/use/destruction of an + * mm_struct. 
+ */ + void (*activate_mm)(struct mm_struct *prev, + struct mm_struct *next); + void (*dup_mmap)(struct mm_struct *oldmm, + struct mm_struct *mm); + void (*exit_mmap)(struct mm_struct *mm); + + + /* TLB operations */ + void (*flush_tlb_user)(void); + void (*flush_tlb_kernel)(void); + void (*flush_tlb_single)(unsigned long addr); + void (*flush_tlb_others)(const struct cpumask *cpus, + struct mm_struct *mm, + unsigned long va); + + /* Hooks for allocating and freeing a pagetable top-level */ + int (*pgd_alloc)(struct mm_struct *mm); + void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd); + + /* + * Hooks for allocating/releasing pagetable pages when they're + * attached to a pagetable + */ + void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count); + void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*release_pte)(unsigned long pfn); + void (*release_pmd)(unsigned long pfn); + void (*release_pud)(unsigned long pfn); + + /* Pagetable manipulation functions */ + void (*set_pte)(pte_t *ptep, pte_t pteval); + void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); + void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*pte_update)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pte_update_defer)(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); + + pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); + + struct paravirt_callee_save pte_val; + struct paravirt_callee_save make_pte; + + struct paravirt_callee_save pgd_val; + struct paravirt_callee_save make_pgd; + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + void (*set_pte_atomic)(pte_t *ptep, pte_t pteval); + void 
(*pte_clear)(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + void (*pmd_clear)(pmd_t *pmdp); + +#endif /* CONFIG_X86_PAE */ + + void (*set_pud)(pud_t *pudp, pud_t pudval); + + struct paravirt_callee_save pmd_val; + struct paravirt_callee_save make_pmd; + +#if PAGETABLE_LEVELS == 4 + struct paravirt_callee_save pud_val; + struct paravirt_callee_save make_pud; + + void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); +#endif /* PAGETABLE_LEVELS == 4 */ +#endif /* PAGETABLE_LEVELS >= 3 */ + +#ifdef CONFIG_HIGHPTE + void *(*kmap_atomic_pte)(struct page *page, enum km_type type); +#endif + + struct pv_lazy_ops lazy_mode; + + /* dom0 ops */ + + /* Sometimes the physical address is a pfn, and sometimes its + an mfn. We can tell which is which from the index. */ + void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags); +}; + +struct raw_spinlock; +struct pv_lock_ops { + int (*spin_is_locked)(struct raw_spinlock *lock); + int (*spin_is_contended)(struct raw_spinlock *lock); + void (*spin_lock)(struct raw_spinlock *lock); + void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); + int (*spin_trylock)(struct raw_spinlock *lock); + void (*spin_unlock)(struct raw_spinlock *lock); +}; + +/* This contains all the paravirt structures: we get a convenient + * number for each function using the offset which we use to indicate + * what to patch. 
*/ +struct paravirt_patch_template { + struct pv_init_ops pv_init_ops; + struct pv_time_ops pv_time_ops; + struct pv_cpu_ops pv_cpu_ops; + struct pv_irq_ops pv_irq_ops; + struct pv_apic_ops pv_apic_ops; + struct pv_mmu_ops pv_mmu_ops; + struct pv_lock_ops pv_lock_ops; +}; + +extern struct pv_info pv_info; +extern struct pv_init_ops pv_init_ops; +extern struct pv_time_ops pv_time_ops; +extern struct pv_cpu_ops pv_cpu_ops; +extern struct pv_irq_ops pv_irq_ops; +extern struct pv_apic_ops pv_apic_ops; +extern struct pv_mmu_ops pv_mmu_ops; +extern struct pv_lock_ops pv_lock_ops; + +#define PARAVIRT_PATCH(x) \ + (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) + +#define paravirt_type(op) \ + [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \ + [paravirt_opptr] "i" (&(op)) +#define paravirt_clobber(clobber) \ + [paravirt_clobber] "i" (clobber) + +/* + * Generate some code, and mark it as patchable by the + * apply_paravirt() alternate instruction patcher. + */ +#define _paravirt_alt(insn_string, type, clobber) \ + "771:\n\t" insn_string "\n" "772:\n" \ + ".pushsection .parainstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR " 771b\n" \ + " .byte " type "\n" \ + " .byte 772b-771b\n" \ + " .short " clobber "\n" \ + ".popsection\n" + +/* Generate patchable code, with the default asm parameters. */ +#define paravirt_alt(insn_string) \ + _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]") + +/* Simple instruction patching code. 
*/ +#define DEF_NATIVE(ops, name, code) \ + extern const char start_##ops##_##name[], end_##ops##_##name[]; \ + asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") + +unsigned paravirt_patch_nop(void); +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len); +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); +unsigned paravirt_patch_ignore(unsigned len); +unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, + unsigned long addr, u16 site_clobbers, + unsigned len); +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, + unsigned long addr, unsigned len); +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len); + +unsigned paravirt_patch_insns(void *insnbuf, unsigned len, + const char *start, const char *end); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len); + +int paravirt_disable_iospace(void); + +/* + * This generates an indirect call based on the operation type number. + * The type number, computed in PARAVIRT_PATCH, is derived from the + * offset into the paravirt_patch_template structure, and can therefore be + * freely converted back into a structure offset. + */ +#define PARAVIRT_CALL "call *%c[paravirt_opptr];" + +/* + * These macros are intended to wrap calls through one of the paravirt + * ops structs, so that they can be later identified and patched at + * runtime. + * + * Normally, a call to a pv_op function is a simple indirect call: + * (pv_op_struct.operations)(args...). + * + * Unfortunately, this is a relatively slow operation for modern CPUs, + * because it cannot necessarily determine what the destination + * address is. In this case, the address is a runtime constant, so at + * the very least we can patch the call to e a simple direct call, or + * ideally, patch an inline implementation into the callsite. 
(Direct + * calls are essentially free, because the call and return addresses + * are completely predictable.) + * + * For i386, these macros rely on the standard gcc "regparm(3)" calling + * convention, in which the first three arguments are placed in %eax, + * %edx, %ecx (in that order), and the remaining arguments are placed + * on the stack. All caller-save registers (eax,edx,ecx) are expected + * to be modified (either clobbered or used for return values). + * X86_64, on the other hand, already specifies a register-based calling + * conventions, returning at %rax, with parameteres going on %rdi, %rsi, + * %rdx, and %rcx. Note that for this reason, x86_64 does not need any + * special handling for dealing with 4 arguments, unlike i386. + * However, x86_64 also have to clobber all caller saved registers, which + * unfortunately, are quite a bit (r8 - r11) + * + * The call instruction itself is marked by placing its start address + * and size into the .parainstructions section, so that + * apply_paravirt() in arch/i386/kernel/alternative.c can do the + * appropriate patching under the control of the backend pv_init_ops + * implementation. + * + * Unfortunately there's no way to get gcc to generate the args setup + * for the call, and then allow the call itself to be generated by an + * inline asm. Because of this, we must do the complete arg setup and + * return value handling from within these macros. This is fairly + * cumbersome. + * + * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments. + * It could be extended to more arguments, but there would be little + * to be gained from that. For each number of arguments, there are + * the two VCALL and CALL variants for void and non-void functions. + * + * When there is a return value, the invoker of the macro must specify + * the return type. 
The macro then uses sizeof() on that type to + * determine whether its a 32 or 64 bit value, and places the return + * in the right register(s) (just %eax for 32-bit, and %edx:%eax for + * 64-bit). For x86_64 machines, it just returns at %rax regardless of + * the return value size. + * + * 64-bit arguments are passed as a pair of adjacent 32-bit arguments + * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments + * in low,high order + * + * Small structures are passed and returned in registers. The macro + * calling convention can't directly deal with this, so the wrapper + * functions must do this. + * + * These PVOP_* macros are only defined within this header. This + * means that all uses must be wrapped in inline functions. This also + * makes sure the incoming and outgoing types are always correct. + */ +#ifdef CONFIG_X86_32 +#define PVOP_VCALL_ARGS \ + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS + +#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS + +#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), "=d" (__edx) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS +#define VEXTRA_CLOBBERS +#else /* CONFIG_X86_64 */ +#define PVOP_VCALL_ARGS \ + unsigned long __edi = __edi, __esi = __esi, \ + __edx = __edx, __ecx = __ecx +#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax + +#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) +#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x)) +#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x)) +#define PVOP_CALL_ARG4(x) "c" ((unsigned long)(x)) + +#define PVOP_VCALL_CLOBBERS "=D" (__edi), \ + "=S" (__esi), "=d" (__edx), \ + "=c" (__ecx) +#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax) + +#define 
PVOP_VCALLEE_CLOBBERS "=a" (__eax) +#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS + +#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11" +#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11" +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_PARAVIRT_DEBUG +#define PVOP_TEST_NULL(op) BUG_ON(op == NULL) +#else +#define PVOP_TEST_NULL(op) ((void)op) +#endif + +#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ + pre, post, ...) \ + ({ \ + rettype __ret; \ + PVOP_CALL_ARGS; \ + PVOP_TEST_NULL(op); \ + /* This is 32-bit specific, but is okay in 64-bit */ \ + /* since this condition will never hold */ \ + if (sizeof(rettype) > sizeof(unsigned long)) { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)((((u64)__edx) << 32) | __eax); \ + } else { \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + __ret = (rettype)__eax; \ + } \ + __ret; \ + }) + +#define __PVOP_CALL(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \ + EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__) + +#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_CALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + +#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \ + ({ \ + PVOP_VCALL_ARGS; \ + PVOP_TEST_NULL(op); \ + asm volatile(pre \ + paravirt_alt(PARAVIRT_CALL) \ + post \ + : call_clbr \ + : paravirt_type(op), \ + paravirt_clobber(clbr), \ + ##__VA_ARGS__ \ + : "memory", "cc" extra_clbr); \ + }) + +#define __PVOP_VCALL(op, pre, post, ...) 
\ + ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \ + VEXTRA_CLOBBERS, \ + pre, post, ##__VA_ARGS__) + +#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...) \ + ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \ + PVOP_VCALLEE_CLOBBERS, , \ + pre, post, ##__VA_ARGS__) + + + +#define PVOP_CALL0(rettype, op) \ + __PVOP_CALL(rettype, op, "", "") +#define PVOP_VCALL0(op) \ + __PVOP_VCALL(op, "", "") + +#define PVOP_CALLEE0(rettype, op) \ + __PVOP_CALLEESAVE(rettype, op, "", "") +#define PVOP_VCALLEE0(op) \ + __PVOP_VCALLEESAVE(op, "", "") + + +#define PVOP_CALL1(rettype, op, arg1) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALL1(op, arg1) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1)) + +#define PVOP_CALLEE1(rettype, op, arg1) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1)) +#define PVOP_VCALLEE1(op, arg1) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1)) + + +#define PVOP_CALL2(rettype, op, arg1, arg2) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALL2(op, arg1, arg2) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + +#define PVOP_CALLEE2(rettype, op, arg1, arg2) \ + __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) +#define PVOP_VCALLEE2(op, arg1, arg2) \ + __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2)) + + +#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \ + __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) +#define PVOP_VCALL3(op, arg1, arg2, arg3) \ + __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \ + PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3)) + +/* This is the only difference in x86_64. 
We can make it much simpler */ +#ifdef CONFIG_X86_32 +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4))) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, \ + "push %[_arg4];", "lea 4(%%esp),%%esp;", \ + "0" ((u32)(arg1)), "1" ((u32)(arg2)), \ + "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4))) +#else +#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \ + __PVOP_CALL(rettype, op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \ + __PVOP_VCALL(op, "", "", \ + PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \ + PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4)) +#endif + +/* Lazy mode for batching updates / context switch */ +enum paravirt_lazy_mode { + PARAVIRT_LAZY_NONE, + PARAVIRT_LAZY_MMU, + PARAVIRT_LAZY_CPU, +}; + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + +void paravirt_enter_lazy_mmu(void); +void paravirt_leave_lazy_mmu(void); + +void _paravirt_nop(void); +u32 _paravirt_ident_32(u32); +u64 _paravirt_ident_64(u64); + +#define paravirt_nop ((void *)_paravirt_nop) + +/* These all sit in the .parainstructions section to tell us what to patch. 
*/ +struct paravirt_patch_site { + u8 *instr; /* original instructions */ + u8 instrtype; /* type of this instruction */ + u8 len; /* length of original instruction */ + u16 clobbers; /* what registers you may clobber */ +}; + +extern struct paravirt_patch_site __parainstructions[], + __parainstructions_end[]; + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PARAVIRT_TYPES_H */ -- cgit v1.2.3-70-g09d2 From e6e9cac8c3417b43498b243c1f8f11780e157168 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:40:59 -0700 Subject: x86: split out core __math_state_restore Split the core fpu state restoration out into __math_state_restore, which assumes that cr0.TS is clear and that the fpu context has been initialized. This will be used during context switch. There are two reasons this is desirable: - There's a small clarification. When __switch_to() calls math_state_restore, it relies on the fact that tsk_used_math() returns true, and so will never do a blocking init_fpu(). __math_state_restore() does not have (or need) that logic, so the question never arises. - It allows the clts() to be moved earlier in __switch_to() so it can be performed while cpu context updates are batched (will be done in a later patch).
[ Impact: refactor code to make reuse cleaner; no functional change ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/include/asm/i387.h | 1 + arch/x86/kernel/traps.c | 33 +++++++++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 175adf58dd4..2e7529295f5 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -26,6 +26,7 @@ extern void fpu_init(void); extern void mxcsr_feature_mask_init(void); extern int init_fpu(struct task_struct *child); extern asmlinkage void math_state_restore(void); +extern void __math_state_restore(void); extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5f935f0d586..71b91669ad1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -813,6 +813,28 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) { } +/* + * __math_state_restore assumes that cr0.TS is already clear and the + * fpu state is all ready for use. Used during context switch. + */ +void __math_state_restore(void) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } + + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task @@ -844,17 +866,8 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. 
- */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; + __math_state_restore(); } EXPORT_SYMBOL_GPL(math_state_restore); -- cgit v1.2.3-70-g09d2 From 2fcddce10f6771cfa0c56fd1e826d50d67d100b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:45:26 -0700 Subject: x86-32: make sure clts is batched during context switch If we're preloading the fpu state during context switch, make sure the clts happens while we're batching the cpu context update, then do the actual __math_state_restore once the updates are flushed. This allows more efficient context switches when running paravirtualized, as all the hypercalls can be folded together into one. [ Impact: optimise paravirtual FPU context switch ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/kernel/process_32.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984a..a80eddd4165 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -350,14 +350,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + bool preload_fpu; /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ - __unlazy_fpu(prev_p); + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; + __unlazy_fpu(prev_p); /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -398,6 +405,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) __switch_to_xtra(prev_p, next_p, tss); + /* If we're going to preload the fpu context, make sure clts + is run while we're batching the cpu state updates. */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -407,15 +419,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() - */ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); /* * Restore %gs if needed (which is common) -- cgit v1.2.3-70-g09d2 From 16d9dbf0c2bd167fdd942b83592d59696c7b73bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:50:27 -0700 Subject: x86-64: move unlazy_fpu() into lazy cpu state part of context switch Make sure that unlazy_fpu()'s stts gets batched along with the other cpu state changes during context switch. (32-bit already does this.) This makes sure it gets batched when running paravirtualized. 
[ Impact: optimise paravirtual FPU context switch ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/kernel/process_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9..c9b8904736d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -419,6 +419,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); + /* Must be after DS reload */ + unlazy_fpu(prev_p); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -459,9 +462,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; - /* Must be after DS reload */ - unlazy_fpu(prev_p); - /* * Switch the PDA and FPU contexts. */ -- cgit v1.2.3-70-g09d2 From 17950c5b243f99cbabef173415ee988c52104d7e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 01:01:01 -0700 Subject: x86-64: move clts into batch cpu state updates when preloading fpu When a task is likely to be using the fpu, we preload its state during the context switch, rather than waiting for it to run an fpu instruction. Make sure the clts() happens while we're doing batched fpu state updates to optimise paravirtualized context switches. 
[ Impact: optimise paravirtual FPU context switch ] Signed-off-by: Jeremy Fitzhardinge Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/kernel/process_64.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c9b8904736d..a28279dbb07 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -386,9 +386,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; + bool preload_fpu; + + /* + * If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter > 5) + if (preload_fpu) prefetch(next->xstate); /* @@ -422,6 +430,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Must be after DS reload */ unlazy_fpu(prev_p); + /* Make sure cpu is ready for new context */ + if (preload_fpu) + clts(); + /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so @@ -480,15 +492,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); - /* If the task has used fpu the last 5 timeslices, just do a full - * restore of the math state immediately to avoid the trap; the - * chances of needing FPU soon are obviously high now - * - * tsk_used_math() checks prevent calling math_state_restore(), - * which can sleep in the case of !tsk_used_math() + /* + * Preload the FPU context, now that we've determined that the + * task is likely to be using it. 
*/ - if (tsk_used_math(next_p) && next_p->fpu_counter > 5) - math_state_restore(); + if (preload_fpu) + __math_state_restore(); return prev_p; } -- cgit v1.2.3-70-g09d2 From 21e70878215f620fe99ea7d7c74bc641aeec932f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 18 Jun 2009 17:09:27 +0530 Subject: x86: oprofile/op_model_amd.c set return values for op_amd_handle_ibs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit op_amd_handle_ibs() should return 0 when IBS is not present or not defined. Fix compilation warning: CC [M] arch/x86/oprofile/op_model_amd.o arch/x86/oprofile/op_model_amd.c: In function ‘op_amd_handle_ibs’: arch/x86/oprofile/op_model_amd.c:217: warning: no return statement in function returning non-void Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index cc930467575..e95268eb922 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -132,7 +132,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, struct op_entry entry; if (!has_ibs) - return 1; + return 0; if (ibs_config.fetch_enabled) { rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); @@ -214,7 +214,10 @@ static void op_amd_stop_ibs(void) #else static inline int op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) { } + struct op_msrs const * const msrs) +{ + return 0; +} static inline void op_amd_start_ibs(void) { } static inline void op_amd_stop_ibs(void) { } -- cgit v1.2.3-70-g09d2 From 4adc667593f83a18a8e54ce94f250fd166a272ac Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:16 +0200 Subject: x86: add copies of some headers to convert to asm-generic Just an intermediate step to make reviewing easier. 
These files are identical copies of the existing headers. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/generic-mman.h | 20 ++++++++ arch/x86/include/asm/generic-module.h | 80 ++++++++++++++++++++++++++++++ arch/x86/include/asm/generic-scatterlist.h | 33 ++++++++++++ arch/x86/include/asm/generic-types.h | 30 +++++++++++ arch/x86/include/asm/generic-ucontext.h | 18 +++++++ 5 files changed, 181 insertions(+) create mode 100644 arch/x86/include/asm/generic-mman.h create mode 100644 arch/x86/include/asm/generic-module.h create mode 100644 arch/x86/include/asm/generic-scatterlist.h create mode 100644 arch/x86/include/asm/generic-types.h create mode 100644 arch/x86/include/asm/generic-ucontext.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/generic-mman.h b/arch/x86/include/asm/generic-mman.h new file mode 100644 index 00000000000..751af2550ed --- /dev/null +++ b/arch/x86/include/asm/generic-mman.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_MMAN_H +#define _ASM_X86_MMAN_H + +#include + +#define MAP_32BIT 0x40 /* only give out 32bit addresses */ + +#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ +#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ +#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ +#define MAP_LOCKED 0x2000 /* pages are locked */ +#define MAP_NORESERVE 0x4000 /* don't check for reservations */ +#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ +#define MAP_NONBLOCK 0x10000 /* do not block on IO */ +#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ + +#define MCL_CURRENT 1 /* lock all current mappings */ +#define MCL_FUTURE 2 /* lock all future mappings */ + +#endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/generic-module.h b/arch/x86/include/asm/generic-module.h new file mode 100644 index 00000000000..47d62743c4d --- /dev/null +++ b/arch/x86/include/asm/generic-module.h @@ -0,0 +1,80 @@ +#ifndef _ASM_X86_MODULE_H 
+#define _ASM_X86_MODULE_H + +/* x86_32/64 are simple */ +struct mod_arch_specific {}; + +#ifdef CONFIG_X86_32 +# define Elf_Shdr Elf32_Shdr +# define Elf_Sym Elf32_Sym +# define Elf_Ehdr Elf32_Ehdr +#else +# define Elf_Shdr Elf64_Shdr +# define Elf_Sym Elf64_Sym +# define Elf_Ehdr Elf64_Ehdr +#endif + +#ifdef CONFIG_X86_64 +/* X86_64 does not define MODULE_PROC_FAMILY */ +#elif defined CONFIG_M386 +#define MODULE_PROC_FAMILY "386 " +#elif defined CONFIG_M486 +#define MODULE_PROC_FAMILY "486 " +#elif defined CONFIG_M586 +#define MODULE_PROC_FAMILY "586 " +#elif defined CONFIG_M586TSC +#define MODULE_PROC_FAMILY "586TSC " +#elif defined CONFIG_M586MMX +#define MODULE_PROC_FAMILY "586MMX " +#elif defined CONFIG_MCORE2 +#define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_M686 +#define MODULE_PROC_FAMILY "686 " +#elif defined CONFIG_MPENTIUMII +#define MODULE_PROC_FAMILY "PENTIUMII " +#elif defined CONFIG_MPENTIUMIII +#define MODULE_PROC_FAMILY "PENTIUMIII " +#elif defined CONFIG_MPENTIUMM +#define MODULE_PROC_FAMILY "PENTIUMM " +#elif defined CONFIG_MPENTIUM4 +#define MODULE_PROC_FAMILY "PENTIUM4 " +#elif defined CONFIG_MK6 +#define MODULE_PROC_FAMILY "K6 " +#elif defined CONFIG_MK7 +#define MODULE_PROC_FAMILY "K7 " +#elif defined CONFIG_MK8 +#define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_X86_ELAN +#define MODULE_PROC_FAMILY "ELAN " +#elif defined CONFIG_MCRUSOE +#define MODULE_PROC_FAMILY "CRUSOE " +#elif defined CONFIG_MEFFICEON +#define MODULE_PROC_FAMILY "EFFICEON " +#elif defined CONFIG_MWINCHIPC6 +#define MODULE_PROC_FAMILY "WINCHIPC6 " +#elif defined CONFIG_MWINCHIP3D +#define MODULE_PROC_FAMILY "WINCHIP3D " +#elif defined CONFIG_MCYRIXIII +#define MODULE_PROC_FAMILY "CYRIXIII " +#elif defined CONFIG_MVIAC3_2 +#define MODULE_PROC_FAMILY "VIAC3-2 " +#elif defined CONFIG_MVIAC7 +#define MODULE_PROC_FAMILY "VIAC7 " +#elif defined CONFIG_MGEODEGX1 +#define MODULE_PROC_FAMILY "GEODEGX1 " +#elif defined CONFIG_MGEODE_LX +#define MODULE_PROC_FAMILY 
"GEODE " +#else +#error unknown processor family +#endif + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_4KSTACKS +# define MODULE_STACKSIZE "4KSTACKS " +# else +# define MODULE_STACKSIZE "" +# endif +# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE +#endif + +#endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/generic-scatterlist.h b/arch/x86/include/asm/generic-scatterlist.h new file mode 100644 index 00000000000..263d397d2ee --- /dev/null +++ b/arch/x86/include/asm/generic-scatterlist.h @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_SCATTERLIST_H +#define _ASM_X86_SCATTERLIST_H + +#include + +struct scatterlist { +#ifdef CONFIG_DEBUG_SG + unsigned long sg_magic; +#endif + unsigned long page_link; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; + unsigned int dma_length; +}; + +#define ARCH_HAS_SG_CHAIN +#define ISA_DMA_THRESHOLD (0x00ffffff) + +/* + * These macros should be used after a pci_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries pci_map_sg + * returns. + */ +#define sg_dma_address(sg) ((sg)->dma_address) +#ifdef CONFIG_X86_32 +# define sg_dma_len(sg) ((sg)->length) +#else +# define sg_dma_len(sg) ((sg)->dma_length) +#endif + +#endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/generic-types.h b/arch/x86/include/asm/generic-types.h new file mode 100644 index 00000000000..09b97745772 --- /dev/null +++ b/arch/x86/include/asm/generic-types.h @@ -0,0 +1,30 @@ +#ifndef _ASM_X86_TYPES_H +#define _ASM_X86_TYPES_H + +#include + +#ifndef __ASSEMBLY__ + +typedef unsigned short umode_t; + +#endif /* __ASSEMBLY__ */ + +/* + * These aren't exported outside the kernel to avoid name space clashes + */ +#ifdef __KERNEL__ + +#ifndef __ASSEMBLY__ + +typedef u64 dma64_addr_t; +#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) +/* DMA addresses come in 32-bit and 64-bit flavours. 
*/ +typedef u64 dma_addr_t; +#else +typedef u32 dma_addr_t; +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/asm/generic-ucontext.h b/arch/x86/include/asm/generic-ucontext.h new file mode 100644 index 00000000000..87324cf439d --- /dev/null +++ b/arch/x86/include/asm/generic-ucontext.h @@ -0,0 +1,18 @@ +#ifndef _ASM_X86_UCONTEXT_H +#define _ASM_X86_UCONTEXT_H + +#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state + * information in the memory layout pointed + * by the fpstate pointer in the ucontext's + * sigcontext struct (uc_mcontext). + */ + +struct ucontext { + unsigned long uc_flags; + struct ucontext *uc_link; + stack_t uc_stack; + struct sigcontext uc_mcontext; + sigset_t uc_sigmask; /* mask last for extensibility */ +}; + +#endif /* _ASM_X86_UCONTEXT_H */ -- cgit v1.2.3-70-g09d2 From 7bfd124d6dae7d394e73753300594a81a022fe7d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:17 +0200 Subject: x86: convert trivial headers to asm-generic version For these nine header files, the asm-generic version should be semantically identical to what is in x86. Change the contents to be binary identical, for better review. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/ioctls.h | 40 ++++++++++++++++++++++++++++------------ arch/x86/include/asm/ipcbuf.h | 15 ++++++++++----- arch/x86/include/asm/msgbuf.h | 30 +++++++++++++++++++----------- arch/x86/include/asm/param.h | 8 +++++--- arch/x86/include/asm/shmbuf.h | 28 ++++++++++++++++++---------- arch/x86/include/asm/socket.h | 10 +++++----- arch/x86/include/asm/sockios.h | 6 +++--- arch/x86/include/asm/termbits.h | 8 ++++---- 8 files changed, 92 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index 0d5b23b7b06..a799e20a769 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h @@ -1,12 +1,23 @@ -#ifndef _ASM_X86_IOCTLS_H -#define _ASM_X86_IOCTLS_H +#ifndef __ASM_GENERIC_IOCTLS_H +#define __ASM_GENERIC_IOCTLS_H -#include +#include + +/* + * These are the most common definitions for tty ioctl numbers. + * Most of them do not use the recommended _IOC(), but there is + * probably some source code out there hardcoding the number, + * so we might as well use them for all new platforms. + * + * The architectures that use different values here typically + * try to be compatible with some Unix variants for the same + * architecture. 
+ */ /* 0x54 is just a magic number to make these relatively unique ('T') */ #define TCGETS 0x5401 -#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */ +#define TCSETS 0x5402 #define TCSETSW 0x5403 #define TCSETSF 0x5404 #define TCGETA 0x5405 @@ -43,7 +54,6 @@ #define TIOCSETD 0x5423 #define TIOCGETD 0x5424 #define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */ #define TIOCSBRK 0x5427 /* BSD compatibility */ #define TIOCCBRK 0x5428 /* BSD compatibility */ #define TIOCGSID 0x5429 /* Return the session ID of FD */ @@ -53,8 +63,7 @@ #define TCSETSF2 _IOW('T', 0x2D, struct termios2) #define TIOCGRS485 0x542E #define TIOCSRS485 0x542F -#define TIOCGPTN _IOR('T', 0x30, unsigned int) - /* Get Pty Number (of pty-mux device) */ +#define TIOCGPTN _IOR('T', 0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ #define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ #define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ #define TCSETX 0x5433 @@ -76,9 +85,16 @@ #define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ #define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ -#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ -#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ -#define FIOQSIZE 0x5460 + +/* + * some architectures define FIOQSIZE as 0x545E, which is used for + * TIOCGHAYESESP on others + */ +#ifndef FIOQSIZE +# define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ +# define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ +# define FIOQSIZE 0x5460 +#endif /* Used for packet mode */ #define TIOCPKT_DATA 0 @@ -89,6 +105,6 @@ #define TIOCPKT_NOSTOP 16 #define TIOCPKT_DOSTOP 32 -#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ +#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ -#endif /* _ASM_X86_IOCTLS_H */ +#endif /* __ASM_GENERIC_IOCTLS_H */ diff --git 
a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index ee678fd5159..888ca1c8f95 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h @@ -1,13 +1,18 @@ -#ifndef _ASM_X86_IPCBUF_H -#define _ASM_X86_IPCBUF_H +#ifndef __ASM_GENERIC_IPCBUF_H +#define __ASM_GENERIC_IPCBUF_H /* - * The ipc64_perm structure for x86 architecture. + * The generic ipc64_perm structure: * Note extra padding because this structure is passed back and forth * between kernel and user space. * + * ipc64_perm was originally meant to be architecture specific, but + * everyone just ended up making identical copies without specific + * optimizations, so we may just as well all use the same one. + * * Pad space is left for: - * - 32-bit mode_t and seq + * - 32-bit mode_t on architectures that only had 16 bit + * - 32-bit seq * - 2 miscellaneous 32-bit values */ @@ -25,4 +30,4 @@ struct ipc64_perm { unsigned long __unused2; }; -#endif /* _ASM_X86_IPCBUF_H */ +#endif /* __ASM_GENERIC_IPCBUF_H */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index 7e4e9481f51..aec850d9159 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h @@ -1,30 +1,38 @@ -#ifndef _ASM_X86_MSGBUF_H -#define _ASM_X86_MSGBUF_H +#ifndef __ASM_GENERIC_MSGBUF_H +#define __ASM_GENERIC_MSGBUF_H +#include /* - * The msqid64_ds structure for i386 architecture. + * generic msqid64_ds structure. + * * Note extra padding because this structure is passed back and forth * between kernel and user space. * - * Pad space on i386 is left for: + * msqid64_ds was originally meant to be architecture specific, but + * everyone just ended up making identical copies without specific + * optimizations, so we may just as well all use the same one. + * + * 64 bit architectures typically define a 64 bit __kernel_time_t, + * so they do not need the first three padding words. + * On big-endian systems, the padding is in the wrong place. 
+ * + * Pad space is left for: * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values - * - * Pad space on x8664 is left for: - * - 2 miscellaneous 64-bit values */ + struct msqid64_ds { struct ipc64_perm msg_perm; __kernel_time_t msg_stime; /* last msgsnd time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused1; #endif __kernel_time_t msg_rtime; /* last msgrcv time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused2; #endif __kernel_time_t msg_ctime; /* last change time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused3; #endif unsigned long msg_cbytes; /* current number of bytes on queue */ @@ -36,4 +44,4 @@ struct msqid64_ds { unsigned long __unused5; }; -#endif /* _ASM_X86_MSGBUF_H */ +#endif /* __ASM_GENERIC_MSGBUF_H */ diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index 6f0d0422f4c..cdf8251bfb6 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_PARAM_H -#define _ASM_X86_PARAM_H +#ifndef __ASM_GENERIC_PARAM_H +#define __ASM_GENERIC_PARAM_H #ifdef __KERNEL__ # define HZ CONFIG_HZ /* Internal kernel timer frequency */ @@ -11,7 +11,9 @@ #define HZ 100 #endif +#ifndef EXEC_PAGESIZE #define EXEC_PAGESIZE 4096 +#endif #ifndef NOGROUP #define NOGROUP (-1) @@ -19,4 +21,4 @@ #define MAXHOSTNAMELEN 64 /* max length of hostname */ -#endif /* _ASM_X86_PARAM_H */ +#endif /* __ASM_GENERIC_PARAM_H */ diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h index b51413b7497..5768fa60ac8 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h @@ -1,32 +1,40 @@ -#ifndef _ASM_X86_SHMBUF_H -#define _ASM_X86_SHMBUF_H +#ifndef __ASM_GENERIC_SHMBUF_H +#define __ASM_GENERIC_SHMBUF_H + +#include /* * The shmid64_ds structure for x86 architecture. * Note extra padding because this structure is passed back and forth * between kernel and user space. 
* - * Pad space on 32 bit is left for: + * shmid64_ds was originally meant to be architecture specific, but + * everyone just ended up making identical copies without specific + * optimizations, so we may just as well all use the same one. + * + * 64 bit architectures typically define a 64 bit __kernel_time_t, + * so they do not need the first two padding words. + * On big-endian systems, the padding is in the wrong place. + * + * + * Pad space is left for: * - 64-bit time_t to solve y2038 problem * - 2 miscellaneous 32-bit values - * - * Pad space on 64 bit is left for: - * - 2 miscellaneous 64-bit values */ struct shmid64_ds { struct ipc64_perm shm_perm; /* operation perms */ size_t shm_segsz; /* size of segment (bytes) */ __kernel_time_t shm_atime; /* last attach time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused1; #endif __kernel_time_t shm_dtime; /* last detach time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused2; #endif __kernel_time_t shm_ctime; /* last change time */ -#ifdef __i386__ +#if __BITS_PER_LONG != 64 unsigned long __unused3; #endif __kernel_pid_t shm_cpid; /* pid of creator */ @@ -48,4 +56,4 @@ struct shminfo64 { unsigned long __unused4; }; -#endif /* _ASM_X86_SHMBUF_H */ +#endif /* __ASM_GENERIC_SHMBUF_H */ diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index ca8bf2cd0ba..d4ae42a06a2 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_SOCKET_H -#define _ASM_X86_SOCKET_H +#ifndef __ASM_GENERIC_SOCKET_H +#define __ASM_GENERIC_SOCKET_H #include @@ -38,8 +38,8 @@ #define SO_BINDTODEVICE 25 /* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 +#define SO_ATTACH_FILTER 26 +#define SO_DETACH_FILTER 27 #define SO_PEERNAME 28 #define SO_TIMESTAMP 29 @@ -57,4 +57,4 @@ #define SO_TIMESTAMPING 37 #define SCM_TIMESTAMPING SO_TIMESTAMPING -#endif /* _ASM_X86_SOCKET_H */ +#endif /* 
__ASM_GENERIC_SOCKET_H */ diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h index 49cc72b5d3c..9a61a369b90 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_SOCKIOS_H -#define _ASM_X86_SOCKIOS_H +#ifndef __ASM_GENERIC_SOCKIOS_H +#define __ASM_GENERIC_SOCKIOS_H /* Socket-level I/O control calls. */ #define FIOSETOWN 0x8901 @@ -10,4 +10,4 @@ #define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ #define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ -#endif /* _ASM_X86_SOCKIOS_H */ +#endif /* __ASM_GENERIC_SOCKIOS_H */ diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index af1b70ea440..1c9773d48cb 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_TERMBITS_H -#define _ASM_X86_TERMBITS_H +#ifndef __ASM_GENERIC_TERMBITS_H +#define __ASM_GENERIC_TERMBITS_H #include @@ -140,7 +140,7 @@ struct ktermios { #define HUPCL 0002000 #define CLOCAL 0004000 #define CBAUDEX 0010000 -#define BOTHER 0010000 /* non standard rate */ +#define BOTHER 0010000 #define B57600 0010001 #define B115200 0010002 #define B230400 0010003 @@ -195,4 +195,4 @@ struct ktermios { #define TCSADRAIN 1 #define TCSAFLUSH 2 -#endif /* _ASM_X86_TERMBITS_H */ +#endif /* __ASM_GENERIC_TERMBITS_H */ -- cgit v1.2.3-70-g09d2 From 06f5013aa8eb5895ced2c71d13f5114103605555 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:18 +0200 Subject: x86: convert almost generic headers to asm-generic version In x86, mman.h, module.h, scatterlist.h, types.h and ucontext.h can use the asm-generic version by just defining the x86 specific parts locally and falling back on the generic code for the common bits. This patch illustrates the differences between the x86 and asm-generic versions by changing a file that is initially identical to the x86 version to one that is identical to the asm-generic version. 
Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/generic-mman.h | 8 +-- arch/x86/include/asm/generic-module.h | 92 ++++++------------------------ arch/x86/include/asm/generic-scatterlist.h | 34 +++++++---- arch/x86/include/asm/generic-types.h | 32 +++++++---- arch/x86/include/asm/generic-ucontext.h | 12 +--- arch/x86/include/asm/mman.h | 14 +---- arch/x86/include/asm/module.h | 13 +---- arch/x86/include/asm/scatterlist.h | 27 +-------- arch/x86/include/asm/types.h | 12 +--- arch/x86/include/asm/ucontext.h | 8 +-- 10 files changed, 73 insertions(+), 179 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/generic-mman.h b/arch/x86/include/asm/generic-mman.h index 751af2550ed..7cab4de2bca 100644 --- a/arch/x86/include/asm/generic-mman.h +++ b/arch/x86/include/asm/generic-mman.h @@ -1,10 +1,8 @@ -#ifndef _ASM_X86_MMAN_H -#define _ASM_X86_MMAN_H +#ifndef __ASM_GENERIC_MMAN_H +#define __ASM_GENERIC_MMAN_H #include -#define MAP_32BIT 0x40 /* only give out 32bit addresses */ - #define MAP_GROWSDOWN 0x0100 /* stack-like segment */ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ @@ -17,4 +15,4 @@ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ -#endif /* _ASM_X86_MMAN_H */ +#endif /* __ASM_GENERIC_MMAN_H */ diff --git a/arch/x86/include/asm/generic-module.h b/arch/x86/include/asm/generic-module.h index 47d62743c4d..ed5b44de4c9 100644 --- a/arch/x86/include/asm/generic-module.h +++ b/arch/x86/include/asm/generic-module.h @@ -1,80 +1,22 @@ -#ifndef _ASM_X86_MODULE_H -#define _ASM_X86_MODULE_H +#ifndef __ASM_GENERIC_MODULE_H +#define __ASM_GENERIC_MODULE_H -/* x86_32/64 are simple */ -struct mod_arch_specific {}; +/* + * Many architectures just need a simple module + * loader without arch specific data. 
+ */ +struct mod_arch_specific +{ +}; -#ifdef CONFIG_X86_32 -# define Elf_Shdr Elf32_Shdr -# define Elf_Sym Elf32_Sym -# define Elf_Ehdr Elf32_Ehdr +#ifdef CONFIG_64BIT +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define Elf_Ehdr Elf64_Ehdr #else -# define Elf_Shdr Elf64_Shdr -# define Elf_Sym Elf64_Sym -# define Elf_Ehdr Elf64_Ehdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr #endif -#ifdef CONFIG_X86_64 -/* X86_64 does not define MODULE_PROC_FAMILY */ -#elif defined CONFIG_M386 -#define MODULE_PROC_FAMILY "386 " -#elif defined CONFIG_M486 -#define MODULE_PROC_FAMILY "486 " -#elif defined CONFIG_M586 -#define MODULE_PROC_FAMILY "586 " -#elif defined CONFIG_M586TSC -#define MODULE_PROC_FAMILY "586TSC " -#elif defined CONFIG_M586MMX -#define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " -#elif defined CONFIG_M686 -#define MODULE_PROC_FAMILY "686 " -#elif defined CONFIG_MPENTIUMII -#define MODULE_PROC_FAMILY "PENTIUMII " -#elif defined CONFIG_MPENTIUMIII -#define MODULE_PROC_FAMILY "PENTIUMIII " -#elif defined CONFIG_MPENTIUMM -#define MODULE_PROC_FAMILY "PENTIUMM " -#elif defined CONFIG_MPENTIUM4 -#define MODULE_PROC_FAMILY "PENTIUM4 " -#elif defined CONFIG_MK6 -#define MODULE_PROC_FAMILY "K6 " -#elif defined CONFIG_MK7 -#define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_X86_ELAN -#define MODULE_PROC_FAMILY "ELAN " -#elif defined CONFIG_MCRUSOE -#define MODULE_PROC_FAMILY "CRUSOE " -#elif defined CONFIG_MEFFICEON -#define MODULE_PROC_FAMILY "EFFICEON " -#elif defined CONFIG_MWINCHIPC6 -#define MODULE_PROC_FAMILY "WINCHIPC6 " -#elif defined CONFIG_MWINCHIP3D -#define MODULE_PROC_FAMILY "WINCHIP3D " -#elif defined CONFIG_MCYRIXIII -#define MODULE_PROC_FAMILY "CYRIXIII " -#elif defined CONFIG_MVIAC3_2 -#define MODULE_PROC_FAMILY "VIAC3-2 " -#elif defined CONFIG_MVIAC7 -#define MODULE_PROC_FAMILY "VIAC7 " 
-#elif defined CONFIG_MGEODEGX1 -#define MODULE_PROC_FAMILY "GEODEGX1 " -#elif defined CONFIG_MGEODE_LX -#define MODULE_PROC_FAMILY "GEODE " -#else -#error unknown processor family -#endif - -#ifdef CONFIG_X86_32 -# ifdef CONFIG_4KSTACKS -# define MODULE_STACKSIZE "4KSTACKS " -# else -# define MODULE_STACKSIZE "" -# endif -# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE -#endif - -#endif /* _ASM_X86_MODULE_H */ +#endif /* __ASM_GENERIC_MODULE_H */ diff --git a/arch/x86/include/asm/generic-scatterlist.h b/arch/x86/include/asm/generic-scatterlist.h index 263d397d2ee..8b9454496a7 100644 --- a/arch/x86/include/asm/generic-scatterlist.h +++ b/arch/x86/include/asm/generic-scatterlist.h @@ -1,7 +1,7 @@ -#ifndef _ASM_X86_SCATTERLIST_H -#define _ASM_X86_SCATTERLIST_H +#ifndef __ASM_GENERIC_SCATTERLIST_H +#define __ASM_GENERIC_SCATTERLIST_H -#include +#include struct scatterlist { #ifdef CONFIG_DEBUG_SG @@ -14,20 +14,30 @@ struct scatterlist { unsigned int dma_length; }; -#define ARCH_HAS_SG_CHAIN -#define ISA_DMA_THRESHOLD (0x00ffffff) - /* - * These macros should be used after a pci_map_sg call has been done + * These macros should be used after a dma_map_sg call has been done * to get bus addresses of each of the SG entries and their lengths. * You should only work with the number of sg entries pci_map_sg - * returns. + * returns, or alternatively stop on the first sg_dma_len(sg) which + * is 0. */ #define sg_dma_address(sg) ((sg)->dma_address) -#ifdef CONFIG_X86_32 -# define sg_dma_len(sg) ((sg)->length) +#ifndef sg_dma_len +/* + * Normally, you have an iommu on 64 bit machines, but not on 32 bit + * machines. Architectures that are differnt should override this. 
+ */ +#if __BITS_PER_LONG == 64 +#define sg_dma_len(sg) ((sg)->dma_length) #else -# define sg_dma_len(sg) ((sg)->dma_length) +#define sg_dma_len(sg) ((sg)->length) +#endif /* 64 bit */ +#endif /* sg_dma_len */ + +#ifndef ISA_DMA_THRESHOLD +#define ISA_DMA_THRESHOLD (~0UL) #endif -#endif /* _ASM_X86_SCATTERLIST_H */ +#define ARCH_HAS_SG_CHAIN + +#endif /* __ASM_GENERIC_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/generic-types.h b/arch/x86/include/asm/generic-types.h index 09b97745772..fba7d33ca3f 100644 --- a/arch/x86/include/asm/generic-types.h +++ b/arch/x86/include/asm/generic-types.h @@ -1,6 +1,9 @@ -#ifndef _ASM_X86_TYPES_H -#define _ASM_X86_TYPES_H - +#ifndef _ASM_GENERIC_TYPES_H +#define _ASM_GENERIC_TYPES_H +/* + * int-ll64 is used practically everywhere now, + * so use it as a reasonable default. + */ #include #ifndef __ASSEMBLY__ @@ -13,18 +16,27 @@ typedef unsigned short umode_t; * These aren't exported outside the kernel to avoid name space clashes */ #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ - -typedef u64 dma64_addr_t; -#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) -/* DMA addresses come in 32-bit and 64-bit flavours. */ +/* + * DMA addresses may be very different from physical addresses + * and pointers. i386 and powerpc may have 64 bit DMA on 32 bit + * systems, while sparc64 uses 32 bit DMA addresses for 64 bit + * physical addresses. + * This default defines dma_addr_t to have the same size as + * phys_addr_t, which is the most common way. + * Do not define the dma64_addr_t type, which never really + * worked. 
+ */ +#ifndef dma_addr_t +#ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 dma_addr_t; #else typedef u32 dma_addr_t; -#endif +#endif /* CONFIG_PHYS_ADDR_T_64BIT */ +#endif /* dma_addr_t */ #endif /* __ASSEMBLY__ */ + #endif /* __KERNEL__ */ -#endif /* _ASM_X86_TYPES_H */ +#endif /* _ASM_GENERIC_TYPES_H */ diff --git a/arch/x86/include/asm/generic-ucontext.h b/arch/x86/include/asm/generic-ucontext.h index 87324cf439d..ad77343e8a9 100644 --- a/arch/x86/include/asm/generic-ucontext.h +++ b/arch/x86/include/asm/generic-ucontext.h @@ -1,11 +1,5 @@ -#ifndef _ASM_X86_UCONTEXT_H -#define _ASM_X86_UCONTEXT_H - -#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state - * information in the memory layout pointed - * by the fpstate pointer in the ucontext's - * sigcontext struct (uc_mcontext). - */ +#ifndef __ASM_GENERIC_UCONTEXT_H +#define __ASM_GENERIC_UCONTEXT_H struct ucontext { unsigned long uc_flags; @@ -15,4 +9,4 @@ struct ucontext { sigset_t uc_sigmask; /* mask last for extensibility */ }; -#endif /* _ASM_X86_UCONTEXT_H */ +#endif /* __ASM_GENERIC_UCONTEXT_H */ diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 751af2550ed..063d8c9e4d6 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -1,20 +1,8 @@ #ifndef _ASM_X86_MMAN_H #define _ASM_X86_MMAN_H -#include - #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings 
*/ +#include #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 47d62743c4d..4a7a192910d 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,18 +1,7 @@ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H -/* x86_32/64 are simple */ -struct mod_arch_specific {}; - -#ifdef CONFIG_X86_32 -# define Elf_Shdr Elf32_Shdr -# define Elf_Sym Elf32_Sym -# define Elf_Ehdr Elf32_Ehdr -#else -# define Elf_Shdr Elf64_Shdr -# define Elf_Sym Elf64_Sym -# define Elf_Ehdr Elf64_Ehdr -#endif +#include #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 263d397d2ee..2097d686471 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -1,33 +1,8 @@ #ifndef _ASM_X86_SCATTERLIST_H #define _ASM_X86_SCATTERLIST_H -#include - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -#define ARCH_HAS_SG_CHAIN #define ISA_DMA_THRESHOLD (0x00ffffff) -/* - * These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns. 
- */ -#define sg_dma_address(sg) ((sg)->dma_address) -#ifdef CONFIG_X86_32 -# define sg_dma_len(sg) ((sg)->length) -#else -# define sg_dma_len(sg) ((sg)->dma_length) -#endif +#include #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index 09b97745772..f2fe528e968 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,19 +1,11 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#include +#define dma_addr_t dma_addr_t -#ifndef __ASSEMBLY__ - -typedef unsigned short umode_t; +#include -#endif /* __ASSEMBLY__ */ - -/* - * These aren't exported outside the kernel to avoid name space clashes - */ #ifdef __KERNEL__ - #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 87324cf439d..7cfc436f86d 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h @@ -7,12 +7,6 @@ * sigcontext struct (uc_mcontext). */ -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct sigcontext uc_mcontext; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; +#include #endif /* _ASM_X86_UCONTEXT_H */ -- cgit v1.2.3-70-g09d2 From 69d5ffdaad7b77b97229b55c36afb20e5bebd29e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:19 +0200 Subject: x86: convert termios.h to the asm-generic version This patch turned out more controversial than expected and may get dropped in the future. I'm including it for reference anyway. The user_termio_to_kernel_termios and kernel_termios_to_user_termio functions on x86 are lacking error checking from get_user and are not portable to big-endian systems, so the asm-generic header has to differ in this regard. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/termios.h | 86 +++++++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index c4ee8056bac..d0922adc56d 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -1,5 +1,12 @@ -#ifndef _ASM_X86_TERMIOS_H -#define _ASM_X86_TERMIOS_H +#ifndef _ASM_GENERIC_TERMIOS_H +#define _ASM_GENERIC_TERMIOS_H +/* + * Most architectures have straight copies of the x86 code, with + * varying levels of bug fixes on top. Usually it's a good idea + * to use this generic version instead, but be careful to avoid + * ABI changes. + * New architectures should not provide their own version. + */ #include #include @@ -54,37 +61,57 @@ struct termio { /* * Translate a "termio" structure into a "termios". Ugh. */ -#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \ - unsigned short __tmp; \ - get_user(__tmp,&(termio)->x); \ - *(unsigned short *) &(termios)->x = __tmp; \ -} - static inline int user_termio_to_kernel_termios(struct ktermios *termios, - struct termio __user *termio) + const struct termio __user *termio) { - SET_LOW_TERMIOS_BITS(termios, termio, c_iflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_oflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_cflag); - SET_LOW_TERMIOS_BITS(termios, termio, c_lflag); - get_user(termios->c_line, &termio->c_line); - return copy_from_user(termios->c_cc, termio->c_cc, NCC); + unsigned short tmp; + + if (get_user(tmp, &termio->c_iflag) < 0) + goto fault; + termios->c_iflag = (0xffff0000 & termios->c_iflag) | tmp; + + if (get_user(tmp, &termio->c_oflag) < 0) + goto fault; + termios->c_oflag = (0xffff0000 & termios->c_oflag) | tmp; + + if (get_user(tmp, &termio->c_cflag) < 0) + goto fault; + termios->c_cflag = (0xffff0000 & termios->c_cflag) | tmp; + + if (get_user(tmp, &termio->c_lflag) < 0) + goto fault; + termios->c_lflag = (0xffff0000 & 
termios->c_lflag) | tmp; + + if (get_user(termios->c_line, &termio->c_line) < 0) + goto fault; + + if (copy_from_user(termios->c_cc, termio->c_cc, NCC) != 0) + goto fault; + + return 0; + + fault: + return -EFAULT; } /* * Translate a "termios" structure into a "termio". Ugh. */ static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) + struct ktermios *termios) { - put_user((termios)->c_iflag, &(termio)->c_iflag); - put_user((termios)->c_oflag, &(termio)->c_oflag); - put_user((termios)->c_cflag, &(termio)->c_cflag); - put_user((termios)->c_lflag, &(termio)->c_lflag); - put_user((termios)->c_line, &(termio)->c_line); - return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC); + if (put_user(termios->c_iflag, &termio->c_iflag) < 0 || + put_user(termios->c_oflag, &termio->c_oflag) < 0 || + put_user(termios->c_cflag, &termio->c_cflag) < 0 || + put_user(termios->c_lflag, &termio->c_lflag) < 0 || + put_user(termios->c_line, &termio->c_line) < 0 || + copy_to_user(termio->c_cc, termios->c_cc, NCC) != 0) + return -EFAULT; + + return 0; } +#ifdef TCGETS2 static inline int user_termios_to_kernel_termios(struct ktermios *k, struct termios2 __user *u) { @@ -108,7 +135,20 @@ static inline int kernel_termios_to_user_termios_1(struct termios __user *u, { return copy_to_user(u, k, sizeof(struct termios)); } +#else /* TCGETS2 */ +static inline int user_termios_to_kernel_termios(struct ktermios *k, + struct termios __user *u) +{ + return copy_from_user(k, u, sizeof(struct termios)); +} + +static inline int kernel_termios_to_user_termios(struct termios __user *u, + struct ktermios *k) +{ + return copy_to_user(u, k, sizeof(struct termios)); +} +#endif /* TCGETS2 */ #endif /* __KERNEL__ */ -#endif /* _ASM_X86_TERMIOS_H */ +#endif /* _ASM_GENERIC_TERMIOS_H */ -- cgit v1.2.3-70-g09d2 From 73a2d096fdf23aa841f7595d114a11ec85a85e4d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Jun 2009 21:48:20 +0200 Subject: x86: remove all 
now-duplicate header files All files that have been made identical to the asm-generic version in the previous patches can now be removed, guaranteeing that this does not introduce semantic changes. Signed-off-by: Arnd Bergmann LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/generic-mman.h | 18 --- arch/x86/include/asm/generic-module.h | 22 ---- arch/x86/include/asm/generic-scatterlist.h | 43 ------- arch/x86/include/asm/generic-types.h | 42 ------ arch/x86/include/asm/generic-ucontext.h | 12 -- arch/x86/include/asm/ioctls.h | 111 +--------------- arch/x86/include/asm/ipcbuf.h | 34 +---- arch/x86/include/asm/mman.h | 2 +- arch/x86/include/asm/module.h | 2 +- arch/x86/include/asm/msgbuf.h | 48 +------ arch/x86/include/asm/param.h | 25 +--- arch/x86/include/asm/scatterlist.h | 2 +- arch/x86/include/asm/shmbuf.h | 60 +-------- arch/x86/include/asm/socket.h | 61 +-------- arch/x86/include/asm/sockios.h | 14 +- arch/x86/include/asm/termbits.h | 199 +---------------------------- arch/x86/include/asm/termios.h | 155 +--------------------- arch/x86/include/asm/types.h | 2 +- arch/x86/include/asm/ucontext.h | 2 +- 19 files changed, 14 insertions(+), 840 deletions(-) delete mode 100644 arch/x86/include/asm/generic-mman.h delete mode 100644 arch/x86/include/asm/generic-module.h delete mode 100644 arch/x86/include/asm/generic-scatterlist.h delete mode 100644 arch/x86/include/asm/generic-types.h delete mode 100644 arch/x86/include/asm/generic-ucontext.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/generic-mman.h b/arch/x86/include/asm/generic-mman.h deleted file mode 100644 index 7cab4de2bca..00000000000 --- a/arch/x86/include/asm/generic-mman.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef __ASM_GENERIC_MMAN_H -#define __ASM_GENERIC_MMAN_H - -#include - -#define MAP_GROWSDOWN 0x0100 /* stack-like segment */ -#define MAP_DENYWRITE 0x0800 /* ETXTBSY */ -#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ -#define MAP_LOCKED 0x2000 /* pages 
are locked */ -#define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ - -#define MCL_CURRENT 1 /* lock all current mappings */ -#define MCL_FUTURE 2 /* lock all future mappings */ - -#endif /* __ASM_GENERIC_MMAN_H */ diff --git a/arch/x86/include/asm/generic-module.h b/arch/x86/include/asm/generic-module.h deleted file mode 100644 index ed5b44de4c9..00000000000 --- a/arch/x86/include/asm/generic-module.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __ASM_GENERIC_MODULE_H -#define __ASM_GENERIC_MODULE_H - -/* - * Many architectures just need a simple module - * loader without arch specific data. - */ -struct mod_arch_specific -{ -}; - -#ifdef CONFIG_64BIT -#define Elf_Shdr Elf64_Shdr -#define Elf_Sym Elf64_Sym -#define Elf_Ehdr Elf64_Ehdr -#else -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Ehdr Elf32_Ehdr -#endif - -#endif /* __ASM_GENERIC_MODULE_H */ diff --git a/arch/x86/include/asm/generic-scatterlist.h b/arch/x86/include/asm/generic-scatterlist.h deleted file mode 100644 index 8b9454496a7..00000000000 --- a/arch/x86/include/asm/generic-scatterlist.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef __ASM_GENERIC_SCATTERLIST_H -#define __ASM_GENERIC_SCATTERLIST_H - -#include - -struct scatterlist { -#ifdef CONFIG_DEBUG_SG - unsigned long sg_magic; -#endif - unsigned long page_link; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -/* - * These macros should be used after a dma_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns, or alternatively stop on the first sg_dma_len(sg) which - * is 0. 
- */ -#define sg_dma_address(sg) ((sg)->dma_address) -#ifndef sg_dma_len -/* - * Normally, you have an iommu on 64 bit machines, but not on 32 bit - * machines. Architectures that are differnt should override this. - */ -#if __BITS_PER_LONG == 64 -#define sg_dma_len(sg) ((sg)->dma_length) -#else -#define sg_dma_len(sg) ((sg)->length) -#endif /* 64 bit */ -#endif /* sg_dma_len */ - -#ifndef ISA_DMA_THRESHOLD -#define ISA_DMA_THRESHOLD (~0UL) -#endif - -#define ARCH_HAS_SG_CHAIN - -#endif /* __ASM_GENERIC_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/generic-types.h b/arch/x86/include/asm/generic-types.h deleted file mode 100644 index fba7d33ca3f..00000000000 --- a/arch/x86/include/asm/generic-types.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _ASM_GENERIC_TYPES_H -#define _ASM_GENERIC_TYPES_H -/* - * int-ll64 is used practically everywhere now, - * so use it as a reasonable default. - */ -#include - -#ifndef __ASSEMBLY__ - -typedef unsigned short umode_t; - -#endif /* __ASSEMBLY__ */ - -/* - * These aren't exported outside the kernel to avoid name space clashes - */ -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ -/* - * DMA addresses may be very different from physical addresses - * and pointers. i386 and powerpc may have 64 bit DMA on 32 bit - * systems, while sparc64 uses 32 bit DMA addresses for 64 bit - * physical addresses. - * This default defines dma_addr_t to have the same size as - * phys_addr_t, which is the most common way. - * Do not define the dma64_addr_t type, which never really - * worked. 
- */ -#ifndef dma_addr_t -#ifdef CONFIG_PHYS_ADDR_T_64BIT -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif /* CONFIG_PHYS_ADDR_T_64BIT */ -#endif /* dma_addr_t */ - -#endif /* __ASSEMBLY__ */ - -#endif /* __KERNEL__ */ - -#endif /* _ASM_GENERIC_TYPES_H */ diff --git a/arch/x86/include/asm/generic-ucontext.h b/arch/x86/include/asm/generic-ucontext.h deleted file mode 100644 index ad77343e8a9..00000000000 --- a/arch/x86/include/asm/generic-ucontext.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __ASM_GENERIC_UCONTEXT_H -#define __ASM_GENERIC_UCONTEXT_H - -struct ucontext { - unsigned long uc_flags; - struct ucontext *uc_link; - stack_t uc_stack; - struct sigcontext uc_mcontext; - sigset_t uc_sigmask; /* mask last for extensibility */ -}; - -#endif /* __ASM_GENERIC_UCONTEXT_H */ diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h index a799e20a769..ec34c760665 100644 --- a/arch/x86/include/asm/ioctls.h +++ b/arch/x86/include/asm/ioctls.h @@ -1,110 +1 @@ -#ifndef __ASM_GENERIC_IOCTLS_H -#define __ASM_GENERIC_IOCTLS_H - -#include - -/* - * These are the most common definitions for tty ioctl numbers. - * Most of them do not use the recommended _IOC(), but there is - * probably some source code out there hardcoding the number, - * so we might as well use them for all new platforms. - * - * The architectures that use different values here typically - * try to be compatible with some Unix variants for the same - * architecture. 
- */ - -/* 0x54 is just a magic number to make these relatively unique ('T') */ - -#define TCGETS 0x5401 -#define TCSETS 0x5402 -#define TCSETSW 0x5403 -#define TCSETSF 0x5404 -#define TCGETA 0x5405 -#define TCSETA 0x5406 -#define TCSETAW 0x5407 -#define TCSETAF 0x5408 -#define TCSBRK 0x5409 -#define TCXONC 0x540A -#define TCFLSH 0x540B -#define TIOCEXCL 0x540C -#define TIOCNXCL 0x540D -#define TIOCSCTTY 0x540E -#define TIOCGPGRP 0x540F -#define TIOCSPGRP 0x5410 -#define TIOCOUTQ 0x5411 -#define TIOCSTI 0x5412 -#define TIOCGWINSZ 0x5413 -#define TIOCSWINSZ 0x5414 -#define TIOCMGET 0x5415 -#define TIOCMBIS 0x5416 -#define TIOCMBIC 0x5417 -#define TIOCMSET 0x5418 -#define TIOCGSOFTCAR 0x5419 -#define TIOCSSOFTCAR 0x541A -#define FIONREAD 0x541B -#define TIOCINQ FIONREAD -#define TIOCLINUX 0x541C -#define TIOCCONS 0x541D -#define TIOCGSERIAL 0x541E -#define TIOCSSERIAL 0x541F -#define TIOCPKT 0x5420 -#define FIONBIO 0x5421 -#define TIOCNOTTY 0x5422 -#define TIOCSETD 0x5423 -#define TIOCGETD 0x5424 -#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */ -#define TIOCSBRK 0x5427 /* BSD compatibility */ -#define TIOCCBRK 0x5428 /* BSD compatibility */ -#define TIOCGSID 0x5429 /* Return the session ID of FD */ -#define TCGETS2 _IOR('T', 0x2A, struct termios2) -#define TCSETS2 _IOW('T', 0x2B, struct termios2) -#define TCSETSW2 _IOW('T', 0x2C, struct termios2) -#define TCSETSF2 _IOW('T', 0x2D, struct termios2) -#define TIOCGRS485 0x542E -#define TIOCSRS485 0x542F -#define TIOCGPTN _IOR('T', 0x30, unsigned int) /* Get Pty Number (of pty-mux device) */ -#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */ -#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */ -#define TCSETX 0x5433 -#define TCSETXF 0x5434 -#define TCSETXW 0x5435 - -#define FIONCLEX 0x5450 -#define FIOCLEX 0x5451 -#define FIOASYNC 0x5452 -#define TIOCSERCONFIG 0x5453 -#define TIOCSERGWILD 0x5454 -#define TIOCSERSWILD 0x5455 -#define TIOCGLCKTRMIOS 0x5456 -#define TIOCSLCKTRMIOS 0x5457 -#define 
TIOCSERGSTRUCT 0x5458 /* For debugging only */ -#define TIOCSERGETLSR 0x5459 /* Get line status register */ -#define TIOCSERGETMULTI 0x545A /* Get multiport config */ -#define TIOCSERSETMULTI 0x545B /* Set multiport config */ - -#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */ -#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */ - -/* - * some architectures define FIOQSIZE as 0x545E, which is used for - * TIOCGHAYESESP on others - */ -#ifndef FIOQSIZE -# define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */ -# define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */ -# define FIOQSIZE 0x5460 -#endif - -/* Used for packet mode */ -#define TIOCPKT_DATA 0 -#define TIOCPKT_FLUSHREAD 1 -#define TIOCPKT_FLUSHWRITE 2 -#define TIOCPKT_STOP 4 -#define TIOCPKT_START 8 -#define TIOCPKT_NOSTOP 16 -#define TIOCPKT_DOSTOP 32 - -#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */ - -#endif /* __ASM_GENERIC_IOCTLS_H */ +#include diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h index 888ca1c8f95..84c7e51cb6d 100644 --- a/arch/x86/include/asm/ipcbuf.h +++ b/arch/x86/include/asm/ipcbuf.h @@ -1,33 +1 @@ -#ifndef __ASM_GENERIC_IPCBUF_H -#define __ASM_GENERIC_IPCBUF_H - -/* - * The generic ipc64_perm structure: - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * ipc64_perm was originally meant to be architecture specific, but - * everyone just ended up making identical copies without specific - * optimizations, so we may just as well all use the same one. 
- * - * Pad space is left for: - * - 32-bit mode_t on architectures that only had 16 bit - * - 32-bit seq - * - 2 miscellaneous 32-bit values - */ - -struct ipc64_perm { - __kernel_key_t key; - __kernel_uid32_t uid; - __kernel_gid32_t gid; - __kernel_uid32_t cuid; - __kernel_gid32_t cgid; - __kernel_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned long __unused1; - unsigned long __unused2; -}; - -#endif /* __ASM_GENERIC_IPCBUF_H */ +#include diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h index 063d8c9e4d6..593e51d4643 100644 --- a/arch/x86/include/asm/mman.h +++ b/arch/x86/include/asm/mman.h @@ -3,6 +3,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ -#include +#include #endif /* _ASM_X86_MMAN_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 4a7a192910d..555bc12bdcd 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_MODULE_H #define _ASM_X86_MODULE_H -#include +#include #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h index aec850d9159..809134c644a 100644 --- a/arch/x86/include/asm/msgbuf.h +++ b/arch/x86/include/asm/msgbuf.h @@ -1,47 +1 @@ -#ifndef __ASM_GENERIC_MSGBUF_H -#define __ASM_GENERIC_MSGBUF_H - -#include -/* - * generic msqid64_ds structure. - * - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * msqid64_ds was originally meant to be architecture specific, but - * everyone just ended up making identical copies without specific - * optimizations, so we may just as well all use the same one. - * - * 64 bit architectures typically define a 64 bit __kernel_time_t, - * so they do not need the first three padding words. - * On big-endian systems, the padding is in the wrong place. 
- * - * Pad space is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - */ - -struct msqid64_ds { - struct ipc64_perm msg_perm; - __kernel_time_t msg_stime; /* last msgsnd time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif - __kernel_time_t msg_rtime; /* last msgrcv time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif - __kernel_time_t msg_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; -#endif - unsigned long msg_cbytes; /* current number of bytes on queue */ - unsigned long msg_qnum; /* number of messages in queue */ - unsigned long msg_qbytes; /* max number of bytes on queue */ - __kernel_pid_t msg_lspid; /* pid of last msgsnd */ - __kernel_pid_t msg_lrpid; /* last receive pid */ - unsigned long __unused4; - unsigned long __unused5; -}; - -#endif /* __ASM_GENERIC_MSGBUF_H */ +#include diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h index cdf8251bfb6..965d4542797 100644 --- a/arch/x86/include/asm/param.h +++ b/arch/x86/include/asm/param.h @@ -1,24 +1 @@ -#ifndef __ASM_GENERIC_PARAM_H -#define __ASM_GENERIC_PARAM_H - -#ifdef __KERNEL__ -# define HZ CONFIG_HZ /* Internal kernel timer frequency */ -# define USER_HZ 100 /* some user interfaces are */ -# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#ifndef EXEC_PAGESIZE -#define EXEC_PAGESIZE 4096 -#endif - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - -#endif /* __ASM_GENERIC_PARAM_H */ +#include diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 2097d686471..75af592677e 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -3,6 +3,6 @@ #define ISA_DMA_THRESHOLD (0x00ffffff) -#include +#include #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/shmbuf.h 
b/arch/x86/include/asm/shmbuf.h index 5768fa60ac8..83c05fc2de3 100644 --- a/arch/x86/include/asm/shmbuf.h +++ b/arch/x86/include/asm/shmbuf.h @@ -1,59 +1 @@ -#ifndef __ASM_GENERIC_SHMBUF_H -#define __ASM_GENERIC_SHMBUF_H - -#include - -/* - * The shmid64_ds structure for x86 architecture. - * Note extra padding because this structure is passed back and forth - * between kernel and user space. - * - * shmid64_ds was originally meant to be architecture specific, but - * everyone just ended up making identical copies without specific - * optimizations, so we may just as well all use the same one. - * - * 64 bit architectures typically define a 64 bit __kernel_time_t, - * so they do not need the first two padding words. - * On big-endian systems, the padding is in the wrong place. - * - * - * Pad space is left for: - * - 64-bit time_t to solve y2038 problem - * - 2 miscellaneous 32-bit values - */ - -struct shmid64_ds { - struct ipc64_perm shm_perm; /* operation perms */ - size_t shm_segsz; /* size of segment (bytes) */ - __kernel_time_t shm_atime; /* last attach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused1; -#endif - __kernel_time_t shm_dtime; /* last detach time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused2; -#endif - __kernel_time_t shm_ctime; /* last change time */ -#if __BITS_PER_LONG != 64 - unsigned long __unused3; -#endif - __kernel_pid_t shm_cpid; /* pid of creator */ - __kernel_pid_t shm_lpid; /* pid of last operator */ - unsigned long shm_nattch; /* no. 
of current attaches */ - unsigned long __unused4; - unsigned long __unused5; -}; - -struct shminfo64 { - unsigned long shmmax; - unsigned long shmmin; - unsigned long shmmni; - unsigned long shmseg; - unsigned long shmall; - unsigned long __unused1; - unsigned long __unused2; - unsigned long __unused3; - unsigned long __unused4; -}; - -#endif /* __ASM_GENERIC_SHMBUF_H */ +#include diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h index d4ae42a06a2..6b71384b9d8 100644 --- a/arch/x86/include/asm/socket.h +++ b/arch/x86/include/asm/socket.h @@ -1,60 +1 @@ -#ifndef __ASM_GENERIC_SOCKET_H -#define __ASM_GENERIC_SOCKET_H - -#include - -/* For setsockopt(2) */ -#define SOL_SOCKET 1 - -#define SO_DEBUG 1 -#define SO_REUSEADDR 2 -#define SO_TYPE 3 -#define SO_ERROR 4 -#define SO_DONTROUTE 5 -#define SO_BROADCAST 6 -#define SO_SNDBUF 7 -#define SO_RCVBUF 8 -#define SO_SNDBUFFORCE 32 -#define SO_RCVBUFFORCE 33 -#define SO_KEEPALIVE 9 -#define SO_OOBINLINE 10 -#define SO_NO_CHECK 11 -#define SO_PRIORITY 12 -#define SO_LINGER 13 -#define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ -#define SO_PASSCRED 16 -#define SO_PEERCRED 17 -#define SO_RCVLOWAT 18 -#define SO_SNDLOWAT 19 -#define SO_RCVTIMEO 20 -#define SO_SNDTIMEO 21 - -/* Security levels - as per NRL IPv6 - don't actually do anything */ -#define SO_SECURITY_AUTHENTICATION 22 -#define SO_SECURITY_ENCRYPTION_TRANSPORT 23 -#define SO_SECURITY_ENCRYPTION_NETWORK 24 - -#define SO_BINDTODEVICE 25 - -/* Socket filtering */ -#define SO_ATTACH_FILTER 26 -#define SO_DETACH_FILTER 27 - -#define SO_PEERNAME 28 -#define SO_TIMESTAMP 29 -#define SCM_TIMESTAMP SO_TIMESTAMP - -#define SO_ACCEPTCONN 30 - -#define SO_PEERSEC 31 -#define SO_PASSSEC 34 -#define SO_TIMESTAMPNS 35 -#define SCM_TIMESTAMPNS SO_TIMESTAMPNS - -#define SO_MARK 36 - -#define SO_TIMESTAMPING 37 -#define SCM_TIMESTAMPING SO_TIMESTAMPING - -#endif /* __ASM_GENERIC_SOCKET_H */ +#include diff --git a/arch/x86/include/asm/sockios.h 
b/arch/x86/include/asm/sockios.h index 9a61a369b90..def6d4746ee 100644 --- a/arch/x86/include/asm/sockios.h +++ b/arch/x86/include/asm/sockios.h @@ -1,13 +1 @@ -#ifndef __ASM_GENERIC_SOCKIOS_H -#define __ASM_GENERIC_SOCKIOS_H - -/* Socket-level I/O control calls. */ -#define FIOSETOWN 0x8901 -#define SIOCSPGRP 0x8902 -#define FIOGETOWN 0x8903 -#define SIOCGPGRP 0x8904 -#define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ - -#endif /* __ASM_GENERIC_SOCKIOS_H */ +#include diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h index 1c9773d48cb..3935b106de7 100644 --- a/arch/x86/include/asm/termbits.h +++ b/arch/x86/include/asm/termbits.h @@ -1,198 +1 @@ -#ifndef __ASM_GENERIC_TERMBITS_H -#define __ASM_GENERIC_TERMBITS_H - -#include - -typedef unsigned char cc_t; -typedef unsigned int speed_t; -typedef unsigned int tcflag_t; - -#define NCCS 19 -struct termios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ -}; - -struct termios2 { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -struct ktermios { - tcflag_t c_iflag; /* input mode flags */ - tcflag_t c_oflag; /* output mode flags */ - tcflag_t c_cflag; /* control mode flags */ - tcflag_t c_lflag; /* local mode flags */ - cc_t c_line; /* line discipline */ - cc_t c_cc[NCCS]; /* control characters */ - speed_t c_ispeed; /* input speed */ - speed_t c_ospeed; /* output speed */ -}; - -/* c_cc characters */ -#define VINTR 0 
-#define VQUIT 1 -#define VERASE 2 -#define VKILL 3 -#define VEOF 4 -#define VTIME 5 -#define VMIN 6 -#define VSWTC 7 -#define VSTART 8 -#define VSTOP 9 -#define VSUSP 10 -#define VEOL 11 -#define VREPRINT 12 -#define VDISCARD 13 -#define VWERASE 14 -#define VLNEXT 15 -#define VEOL2 16 - -/* c_iflag bits */ -#define IGNBRK 0000001 -#define BRKINT 0000002 -#define IGNPAR 0000004 -#define PARMRK 0000010 -#define INPCK 0000020 -#define ISTRIP 0000040 -#define INLCR 0000100 -#define IGNCR 0000200 -#define ICRNL 0000400 -#define IUCLC 0001000 -#define IXON 0002000 -#define IXANY 0004000 -#define IXOFF 0010000 -#define IMAXBEL 0020000 -#define IUTF8 0040000 - -/* c_oflag bits */ -#define OPOST 0000001 -#define OLCUC 0000002 -#define ONLCR 0000004 -#define OCRNL 0000010 -#define ONOCR 0000020 -#define ONLRET 0000040 -#define OFILL 0000100 -#define OFDEL 0000200 -#define NLDLY 0000400 -#define NL0 0000000 -#define NL1 0000400 -#define CRDLY 0003000 -#define CR0 0000000 -#define CR1 0001000 -#define CR2 0002000 -#define CR3 0003000 -#define TABDLY 0014000 -#define TAB0 0000000 -#define TAB1 0004000 -#define TAB2 0010000 -#define TAB3 0014000 -#define XTABS 0014000 -#define BSDLY 0020000 -#define BS0 0000000 -#define BS1 0020000 -#define VTDLY 0040000 -#define VT0 0000000 -#define VT1 0040000 -#define FFDLY 0100000 -#define FF0 0000000 -#define FF1 0100000 - -/* c_cflag bit meaning */ -#define CBAUD 0010017 -#define B0 0000000 /* hang up */ -#define B50 0000001 -#define B75 0000002 -#define B110 0000003 -#define B134 0000004 -#define B150 0000005 -#define B200 0000006 -#define B300 0000007 -#define B600 0000010 -#define B1200 0000011 -#define B1800 0000012 -#define B2400 0000013 -#define B4800 0000014 -#define B9600 0000015 -#define B19200 0000016 -#define B38400 0000017 -#define EXTA B19200 -#define EXTB B38400 -#define CSIZE 0000060 -#define CS5 0000000 -#define CS6 0000020 -#define CS7 0000040 -#define CS8 0000060 -#define CSTOPB 0000100 -#define CREAD 0000200 -#define 
PARENB 0000400 -#define PARODD 0001000 -#define HUPCL 0002000 -#define CLOCAL 0004000 -#define CBAUDEX 0010000 -#define BOTHER 0010000 -#define B57600 0010001 -#define B115200 0010002 -#define B230400 0010003 -#define B460800 0010004 -#define B500000 0010005 -#define B576000 0010006 -#define B921600 0010007 -#define B1000000 0010010 -#define B1152000 0010011 -#define B1500000 0010012 -#define B2000000 0010013 -#define B2500000 0010014 -#define B3000000 0010015 -#define B3500000 0010016 -#define B4000000 0010017 -#define CIBAUD 002003600000 /* input baud rate */ -#define CMSPAR 010000000000 /* mark or space (stick) parity */ -#define CRTSCTS 020000000000 /* flow control */ - -#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */ - -/* c_lflag bits */ -#define ISIG 0000001 -#define ICANON 0000002 -#define XCASE 0000004 -#define ECHO 0000010 -#define ECHOE 0000020 -#define ECHOK 0000040 -#define ECHONL 0000100 -#define NOFLSH 0000200 -#define TOSTOP 0000400 -#define ECHOCTL 0001000 -#define ECHOPRT 0002000 -#define ECHOKE 0004000 -#define FLUSHO 0010000 -#define PENDIN 0040000 -#define IEXTEN 0100000 - -/* tcflow() and TCXONC use these */ -#define TCOOFF 0 -#define TCOON 1 -#define TCIOFF 2 -#define TCION 3 - -/* tcflush() and TCFLSH use these */ -#define TCIFLUSH 0 -#define TCOFLUSH 1 -#define TCIOFLUSH 2 - -/* tcsetattr uses these */ -#define TCSANOW 0 -#define TCSADRAIN 1 -#define TCSAFLUSH 2 - -#endif /* __ASM_GENERIC_TERMBITS_H */ +#include diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h index d0922adc56d..280d78a9d96 100644 --- a/arch/x86/include/asm/termios.h +++ b/arch/x86/include/asm/termios.h @@ -1,154 +1 @@ -#ifndef _ASM_GENERIC_TERMIOS_H -#define _ASM_GENERIC_TERMIOS_H -/* - * Most architectures have straight copies of the x86 code, with - * varying levels of bug fixes on top. Usually it's a good idea - * to use this generic version instead, but be careful to avoid - * ABI changes. 
- * New architectures should not provide their own version. - */ - -#include -#include - -struct winsize { - unsigned short ws_row; - unsigned short ws_col; - unsigned short ws_xpixel; - unsigned short ws_ypixel; -}; - -#define NCC 8 -struct termio { - unsigned short c_iflag; /* input mode flags */ - unsigned short c_oflag; /* output mode flags */ - unsigned short c_cflag; /* control mode flags */ - unsigned short c_lflag; /* local mode flags */ - unsigned char c_line; /* line discipline */ - unsigned char c_cc[NCC]; /* control characters */ -}; - -/* modem lines */ -#define TIOCM_LE 0x001 -#define TIOCM_DTR 0x002 -#define TIOCM_RTS 0x004 -#define TIOCM_ST 0x008 -#define TIOCM_SR 0x010 -#define TIOCM_CTS 0x020 -#define TIOCM_CAR 0x040 -#define TIOCM_RNG 0x080 -#define TIOCM_DSR 0x100 -#define TIOCM_CD TIOCM_CAR -#define TIOCM_RI TIOCM_RNG -#define TIOCM_OUT1 0x2000 -#define TIOCM_OUT2 0x4000 -#define TIOCM_LOOP 0x8000 - -/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */ - -#ifdef __KERNEL__ - -#include - -/* intr=^C quit=^\ erase=del kill=^U - eof=^D vtime=\0 vmin=\1 sxtc=\0 - start=^Q stop=^S susp=^Z eol=\0 - reprint=^R discard=^U werase=^W lnext=^V - eol2=\0 -*/ -#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0" - -/* - * Translate a "termio" structure into a "termios". Ugh. 
- */ -static inline int user_termio_to_kernel_termios(struct ktermios *termios, - const struct termio __user *termio) -{ - unsigned short tmp; - - if (get_user(tmp, &termio->c_iflag) < 0) - goto fault; - termios->c_iflag = (0xffff0000 & termios->c_iflag) | tmp; - - if (get_user(tmp, &termio->c_oflag) < 0) - goto fault; - termios->c_oflag = (0xffff0000 & termios->c_oflag) | tmp; - - if (get_user(tmp, &termio->c_cflag) < 0) - goto fault; - termios->c_cflag = (0xffff0000 & termios->c_cflag) | tmp; - - if (get_user(tmp, &termio->c_lflag) < 0) - goto fault; - termios->c_lflag = (0xffff0000 & termios->c_lflag) | tmp; - - if (get_user(termios->c_line, &termio->c_line) < 0) - goto fault; - - if (copy_from_user(termios->c_cc, termio->c_cc, NCC) != 0) - goto fault; - - return 0; - - fault: - return -EFAULT; -} - -/* - * Translate a "termios" structure into a "termio". Ugh. - */ -static inline int kernel_termios_to_user_termio(struct termio __user *termio, - struct ktermios *termios) -{ - if (put_user(termios->c_iflag, &termio->c_iflag) < 0 || - put_user(termios->c_oflag, &termio->c_oflag) < 0 || - put_user(termios->c_cflag, &termio->c_cflag) < 0 || - put_user(termios->c_lflag, &termio->c_lflag) < 0 || - put_user(termios->c_line, &termio->c_line) < 0 || - copy_to_user(termio->c_cc, termios->c_cc, NCC) != 0) - return -EFAULT; - - return 0; -} - -#ifdef TCGETS2 -static inline int user_termios_to_kernel_termios(struct ktermios *k, - struct termios2 __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios2)); -} - -static inline int kernel_termios_to_user_termios(struct termios2 __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios2)); -} - -static inline int user_termios_to_kernel_termios_1(struct ktermios *k, - struct termios __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios)); -} - -static inline int kernel_termios_to_user_termios_1(struct termios __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, 
sizeof(struct termios)); -} -#else /* TCGETS2 */ -static inline int user_termios_to_kernel_termios(struct ktermios *k, - struct termios __user *u) -{ - return copy_from_user(k, u, sizeof(struct termios)); -} - -static inline int kernel_termios_to_user_termios(struct termios __user *u, - struct ktermios *k) -{ - return copy_to_user(u, k, sizeof(struct termios)); -} -#endif /* TCGETS2 */ - -#endif /* __KERNEL__ */ - -#endif /* _ASM_GENERIC_TERMIOS_H */ +#include diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index f2fe528e968..df1da20f453 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -3,7 +3,7 @@ #define dma_addr_t dma_addr_t -#include +#include #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h index 7cfc436f86d..b7c29c8017f 100644 --- a/arch/x86/include/asm/ucontext.h +++ b/arch/x86/include/asm/ucontext.h @@ -7,6 +7,6 @@ * sigcontext struct (uc_mcontext). */ -#include +#include #endif /* _ASM_X86_UCONTEXT_H */ -- cgit v1.2.3-70-g09d2 From feaa0457ec8351cae855edc9a3052ac49322538e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 20 Jun 2009 16:15:40 +0530 Subject: x86: ds.c fix invalid assignment Fixes the type mixups that cause the following sparse warnings: CHECK arch/x86/kernel/ds.c arch/x86/kernel/ds.c:549:19: warning: incorrect type in argument 2 (invalid types) arch/x86/kernel/ds.c:549:19: expected bad type enum bts_field field arch/x86/kernel/ds.c:549:19: got int arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static 
[unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:514:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:514:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:514:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:514:7: error: invalid assignment arch/x86/kernel/ds.c:520:35: error: incompatible types for operation (*) arch/x86/kernel/ds.c:520:35: left side has type unsigned char static [unsigned] [toplevel] sizeof_ptr_field arch/x86/kernel/ds.c:520:35: right side has type bad type enum bts_field field arch/x86/kernel/ds.c:520:7: error: invalid assignment arch/x86/kernel/ds.c:520:35: error: incompatible types for operation (*) Signed-off-by: Jaswinder Singh Rajput Cc: Markus Metzger LKML-Reference: <1245494740.8613.12.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 6 +++--- 
1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 48bfe138603..ef42a038f1a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -509,15 +509,15 @@ enum bts_field { bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; -static inline unsigned long bts_get(const char *base, enum bts_field field) +static inline unsigned long bts_get(const char *base, unsigned long field) { base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } -static inline void bts_set(char *base, enum bts_field field, unsigned long val) +static inline void bts_set(char *base, unsigned long field, unsigned long val) { - base += (ds_cfg.sizeof_ptr_field * field);; + base += (ds_cfg.sizeof_ptr_field * field); (*(unsigned long *)base) = val; } -- cgit v1.2.3-70-g09d2 From e487683990972bf9aa4e688434c46ead76748bca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:27:16 -0700 Subject: x86, mce: fix typo in comment in asm/mce.h Fix comment to match the actual declaration. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec..b50b9e9042c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ -- cgit v1.2.3-70-g09d2 From a95436e44a76a32dcbe7c8df59701ddde53017c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:28:22 -0700 Subject: x86, mce: use atomic_inc_return() instead of add by 1 Use atomic_inc_return() instead of atomic_add_return() by 1. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..7da8fec9ca8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -242,7 +242,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_add_return(1, &mce_paniced) > 1) + if (atomic_inc_return(&mce_paniced) > 1) wait_for_panic(); barrier(); @@ -705,7 +705,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. 
-- cgit v1.2.3-70-g09d2 From c9944881acf02b6f25fa62a0441a98b7dc0d7ae6 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 24 Jun 2009 13:42:40 +0800 Subject: crypto: aes-ni - Don't print message with KERN_ERR on old system When the aes-intel module is loaded on a system that does not have the AES instructions, it prints Intel AES-NI instructions are not detected. at level KERN_ERR. Since aes-intel is aliased to "aes" it will be tried whenever anything uses AES and spam the console. This doesn't match existing practice for how to handle "no hardware" when initializing a module, so downgrade the message to KERN_INFO. Signed-off-by: Roland Dreier Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index c580c5ec1ca..d3ec8d588d4 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -636,7 +636,7 @@ static int __init aesni_init(void) int err; if (!cpu_has_aes) { - printk(KERN_ERR "Intel AES-NI instructions are not detected.\n"); + printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); return -ENODEV; } if ((err = crypto_register_alg(&aesni_alg))) -- cgit v1.2.3-70-g09d2 From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. 
Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. 
Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/alpha/Kconfig | 3 +++ arch/ia64/Kconfig | 3 +++ arch/powerpc/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sparc/Kconfig | 3 +++ arch/x86/Kconfig | 3 --- include/linux/percpu.h | 12 +++++++++--- init/main.c | 24 ------------------------ kernel/module.c | 6 +++--- mm/Makefile | 2 +- mm/allocpercpu.c | 28 ++++++++++++++++++++++++++++ mm/percpu.c | 40 +++++++++++++++++++++++++++++++++++++++- 12 files changed, 95 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c39..05d86407188 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY depends on SMP default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..328d2f8b8c3 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bf6cedfa05d..a774c2acbe6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool PPC64 + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a14dba0e4d6..f4a3cc62d28 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_LEGACY_PER_CPU_AREA + 
def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eab..7a8698b913f 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,9 @@ config AUDIT_ARCH bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y if SPARC64 + config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f..a48a90076d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f05..e5000343dd6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +99,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +128,8 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type 
*)__alloc_percpu(sizeof(type), \ diff --git a/init/main.c b/init/main.c index 09131ec090c..602d724afa5 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2..f5934954fa9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. 
*/ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd642669..c77c6487552 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a4735..df34ceae0c6 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd885..b14984566f5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * * To use this allocator, arch code should do the followings. 
* - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr, NULL); } + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
+ */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, -1); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3-70-g09d2 From 204fba4aa303ea4a7bb726a539bf4a5b9e3203d0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:45 +0900 Subject: percpu: cleanup percpu array definitions Currently, the following three different ways to define percpu arrays are in use. 1. DEFINE_PER_CPU(elem_type[array_len], array_name); 2. DEFINE_PER_CPU(elem_type, array_name[array_len]); 3. DEFINE_PER_CPU(elem_type, array_name)[array_len]; Unify to #1 which correctly separates the roles of the two parameters and thus allows more flexibility in the way percpu variables are defined. [ Impact: cleanup ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Tony Luck Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Jeremy Fitzhardinge Cc: linux-mm@kvack.org Cc: Christoph Lameter Cc: David S. 
Miller --- arch/ia64/kernel/smp.c | 2 +- arch/ia64/sn/kernel/setup.c | 2 +- arch/powerpc/mm/stab.c | 2 +- arch/powerpc/platforms/ps3/smp.c | 2 +- arch/x86/kernel/cpu/cpu_debug.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- drivers/xen/events.c | 4 ++-- mm/quicklist.c | 2 +- mm/slub.c | 4 ++-- net/ipv4/syncookies.c | 2 +- net/ipv6/syncookies.c | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index f0c521b0ba4..94cf78ba28f 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,7 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; +static DEFINE_PER_CPU(unsigned short [NR_CPUS], shadow_flush_counts) ____cacheline_aligned; #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index e456f062f24..ece1bf99449 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info); EXPORT_PER_CPU_SYMBOL(__sn_hub_info); -DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]); +DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid); EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 98cd1dc2ae7..6e9b69c9985 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -31,7 +31,7 @@ struct stab_entry { #define NR_STAB_CACHE_ENTRIES 8 static DEFINE_PER_CPU(long, stab_cache_ptr); -static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); +static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); /* * Create a segment table 
entry for the given esid/vsid pair. diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c index f6e04bcc70e..51ffde40af2 100644 --- a/arch/powerpc/platforms/ps3/smp.c +++ b/arch/powerpc/platforms/ps3/smp.c @@ -37,7 +37,7 @@ */ #define MSG_COUNT 4 -static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]); +static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs); static void do_message_pass(int target, int msg) { diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd040..dca325c0399 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include #include -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bd..bd2a2fa8462 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f78..4946288d683 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 
[X86_PMC_IDX_MAX], prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 891d2e90753..ab581fa6268 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -47,10 +47,10 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; /* Interrupt types. */ enum xen_irq_type { diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d1b4f..6eedf7e473d 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include #include -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 diff --git a/mm/slub.c b/mm/slub.c index ce62b770e2f..23bb79acc4b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2086,8 +2086,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index cd2b97f1b6e..84d90f2799b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,7 +37,7 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static 
DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 8c2513982b6..23d0d6db046 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,7 +74,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) -- cgit v1.2.3-70-g09d2 From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:48 +0900 Subject: percpu: clean up percpu variable definitions Percpu variable definition is about to be updated such that all percpu symbols including the static ones must be unique. Update percpu variable definitions accordingly. * as,cfq: rename ioc_count uniquely * cpufreq: rename cpu_dbs_info uniquely * xen: move nesting_count out of xen_evtchn_do_upcall() and rename it * mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and rename it * ipv4,6: rename cookie_scratch uniquely * x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to pmc_irq_entry and nmi_entry to pmc_nmi_entry * perf_counter: rename disable_count to perf_disable_count * ftrace: rename test_event_disable to ftrace_test_event_disable * kmemleak: rename test_pointer to kmemleak_test_pointer * mce: rename next_interval to mce_next_interval [ Impact: percpu usage cleanups, no duplicate static percpu var names ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ivan Kokshaysky Cc: Jens Axboe Cc: Dave Jones Cc: Jeremy Fitzhardinge Cc: linux-mm Cc: David S. 
Miller Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Li Zefan Cc: Catalin Marinas Cc: Andi Kleen --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- block/as-iosched.c | 10 +++++----- block/cfq-iosched.c | 10 +++++----- drivers/cpufreq/cpufreq_conservative.c | 12 ++++++------ drivers/cpufreq/cpufreq_ondemand.c | 15 ++++++++------- drivers/xen/events.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- kernel/trace/trace_events.c | 6 +++--- mm/kmemleak-test.c | 6 +++--- mm/page-writeback.c | 5 +++-- net/ipv4/syncookies.c | 5 +++-- net/ipv6/syncookies.c | 5 +++-- 13 files changed, 58 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968b..cba8cd3e957 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. 
*/ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1914,7 +1914,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4946288d683..5fdf63aaaba 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -901,7 +901,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1089,7 +1089,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1561,8 +1561,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static void @@ -1709,9 +1709,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d..ce8ba57c655 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + 
elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 833ec18eaa6..0f1cc7d3855 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -1422,7 +1422,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1431,7 +1431,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1557,7 +1557,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 
INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2658,7 +2658,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7fc58af748b..a7ef465c83b 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -65,7 +65,7 @@ struct cpu_dbs_info_s { int cpu; unsigned int enable:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -138,7 +138,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy; @@ -298,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -388,7 +388,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, wall_time; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -528,7 +528,7 @@ static int cpufreq_governor_dbs(struct 
cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -548,7 +548,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1911d172935..36f292a7bd0 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -73,7 +73,7 @@ struct cpu_dbs_info_s { unsigned int enable:1, sample_type:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -151,7 +151,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int jiffies_total, jiffies_hi, jiffies_lo; - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, + policy->cpu); if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -196,7 +197,7 @@ static void ondemand_powersave_bias_init(void) { int i; for_each_online_cpu(i) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i); dbs_info->freq_table = cpufreq_frequency_get_table(i); dbs_info->freq_lo = 0; } @@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->prev_cpu_idle 
= get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -391,7 +392,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) unsigned int load, load_freq; int freq_avg; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -548,7 +549,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -570,7 +571,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ab581fa6268..7d2987e9b1b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; exit_idle(); @@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. 
*/ @@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; } while(count != 1); out: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea..1fd7a2e7575 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b..54b1de5074b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); 
ftrace_preempt_enable(resched); } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f52..177a5169bbd 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935..2c075dcf03d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -607,6 +607,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +626,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,7 +638,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. 
*/ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { *p = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 84d90f2799b..a6e0e077ac3 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 23d0d6db046..6b6ae913b5d 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining -- cgit v1.2.3-70-g09d2 From 2495fbf7effa6868f5d74124ae9b22a57980755b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 26 Jun 2009 10:53:57 -0700 Subject: x86, setup: remove obsolete pre-Kconfig CONFIG_VIDEO_ variables There were a set of pre-Kconfig configuration variables defined in the video code. 
There is absolutely no evidence that they have been tweaked by anybody in modern history, so just get rid of them and hope nobody notices. If someone does complain, these should be made real Kconfig variables. Reported-by: Robert P. J. Day Signed-off-by: H. Peter Anvin --- arch/x86/boot/video-vesa.c | 7 +------ arch/x86/boot/video-vga.c | 10 ---------- arch/x86/boot/video.c | 5 ----- arch/x86/boot/video.h | 20 ++------------------ 4 files changed, 3 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index c700147d6ff..d7ef26ba454 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -31,7 +31,6 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { -#if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; @@ -49,8 +48,7 @@ static int vesa_probe(void) vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ -#endif /* CONFIG_VIDEO_VESA || CONFIG_FIRMWARE_EDID */ -#ifdef CONFIG_VIDEO_VESA + set_fs(vginfo.video_mode_ptr.seg); mode_ptr = vginfo.video_mode_ptr.off; @@ -102,9 +100,6 @@ static int vesa_probe(void) } return nmodes; -#else - return 0; -#endif /* CONFIG_VIDEO_VESA */ } static int vesa_set_mode(struct mode_info *mode) diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 8f8d827e254..819caa1f200 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -47,14 +47,6 @@ static u8 vga_set_basic_mode(void) initregs(&ireg); -#ifdef CONFIG_VIDEO_400_HACK - if (adapter >= ADAPTER_VGA) { - ireg.ax = 0x1202; - ireg.bx = 0x0030; - intcall(0x10, &ireg, NULL); - } -#endif - ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; @@ -62,11 +54,9 @@ static u8 vga_set_basic_mode(void) set_fs(0); rows = rdfs8(0x484); /* rows minus one */ -#ifndef CONFIG_VIDEO_400_HACK if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && 
(rows == 0 || rows == 24)) return mode; -#endif if (mode != 3 && mode != 7) mode = 3; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index bad728b76fc..d42da380249 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -221,7 +221,6 @@ static unsigned int mode_menu(void) } } -#ifdef CONFIG_VIDEO_RETAIN /* Save screen content to the heap */ static struct saved_screen { int x, y; @@ -299,10 +298,6 @@ static void restore_screen(void) ireg.dl = saved.curx; intcall(0x10, &ireg, NULL); } -#else -#define save_screen() ((void)0) -#define restore_screen() ((void)0) -#endif void set_video(void) { diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index 5bb174a997f..ff339c5db31 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -17,19 +17,8 @@ #include -/* Enable autodetection of SVGA adapters and modes. */ -#undef CONFIG_VIDEO_SVGA - -/* Enable autodetection of VESA modes */ -#define CONFIG_VIDEO_VESA - -/* Retain screen contents when switching modes */ -#define CONFIG_VIDEO_RETAIN - -/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */ -#undef CONFIG_VIDEO_400_HACK - -/* This code uses an extended set of video mode numbers. These include: +/* + * This code uses an extended set of video mode numbers. These include: * Aliases for standard modes * NORMAL_VGA (-1) * EXTENDED_VGA (-2) @@ -67,13 +56,8 @@ /* The "recalculate timings" flag */ #define VIDEO_RECALC 0x8000 -/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ -#ifdef CONFIG_VIDEO_RETAIN void store_screen(void); #define DO_STORE() store_screen() -#else -#define DO_STORE() ((void)0) -#endif /* CONFIG_VIDEO_RETAIN */ /* * Mode table structures -- cgit v1.2.3-70-g09d2 From 087975b06b00af9bf888fab6f94ae113c5cd80bd Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 27 Jun 2009 15:35:15 +0900 Subject: x86: Clean up dump_pagetable() Use pgtable access helpers for 32-bit version dump_pagetable() and get rid of __typeof__() operators. 
This needs to make pmd_pfn() available for 2-level pgtable. Also, remove some casts for 64-bit version dump_pagetable(). Signed-off-by: Akinobu Mita LKML-Reference: <20090627063514.GA2834@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 10 ++++----- arch/x86/mm/fault.c | 51 +++++++++++++++++------------------------- 2 files changed, 26 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3cc06e3fceb..af5481e9486 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -134,6 +134,11 @@ static inline unsigned long pte_pfn(pte_t pte) return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT; } +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -422,11 +427,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } -static inline unsigned long pmd_pfn(pmd_t pmd) -{ - return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; -} - static inline int pud_large(pud_t pud) { return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 78a5fff857b..9bf7e52c286 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -285,26 +285,25 @@ check_v8086_mode(struct pt_regs *regs, unsigned long address, tsk->thread.screen_bitmap |= 1 << bit; } -static void dump_pagetable(unsigned long address) +static bool low_pfn(unsigned long pfn) { - __typeof__(pte_val(__pte(0))) page; + return pfn < max_low_pfn; +} - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +static void dump_pagetable(unsigned long address) +{ + pgd_t *base = __va(read_cr3()); + pgd_t *pgd = &base[pgd_index(address)]; + pmd_t *pmd; + pte_t *pte; #ifdef CONFIG_X86_PAE - 
printk("*pdpt = %016Lx ", page); - if ((page >> PAGE_SHIFT) < max_low_pfn - && page & _PAGE_PRESENT) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } -#else - printk("*pde = %08lx ", page); + printk("*pdpt = %016Lx ", pgd_val(*pgd)); + if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) + goto out; #endif + pmd = pmd_offset(pud_offset(pgd, address), address); + printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* * We must not directly access the pte in the highpte @@ -312,16 +311,12 @@ static void dump_pagetable(unsigned long address) * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + goto out; + pte = pte_offset_kernel(pmd, address); + printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); +out: printk("\n"); } @@ -449,16 +444,12 @@ static int bad_address(void *p) static void dump_pagetable(unsigned long address) { - pgd_t *pgd; + pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); + pgd_t *pgd = base + pgd_index(address); pud_t *pud; pmd_t *pmd; pte_t *pte; - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - - pgd += pgd_index(address); if (bad_address(pgd)) goto bad; -- cgit v1.2.3-70-g09d2 From ce0c0f9eec2f377055e8b23c6fa192202381e022 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Sun, 28 Jun 2009 18:07:39 +0800 Subject: x86, pgtable.h: Clean up types Use "unsigned long" consistently, not "unsigned". 
Signed-off-by: Figo.zhang LKML-Reference: <1246183659.2530.4.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index af5481e9486..9de8729c1c8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -356,7 +356,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) * this macro returns the index of the entry in the pmd page which would * control the given virtual address */ -static inline unsigned pmd_index(unsigned long address) +static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } @@ -376,7 +376,7 @@ static inline unsigned pmd_index(unsigned long address) * this function returns the index of the entry in the pte page which would * control the given virtual address */ -static inline unsigned pte_index(unsigned long address) +static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } @@ -462,7 +462,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. 
*/ -static inline unsigned pud_index(unsigned long address) +static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } -- cgit v1.2.3-70-g09d2 From 565b0c1f100408ccbcb04ba458a14da454cb271d Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Mon, 29 Jun 2009 12:02:55 +0800 Subject: x86, highmem_32.c: Clean up comment Signed-off-by: Figo.zhang Cc: Andrew Morton LKML-Reference: <1246248175.5759.12.camel@myhost> Signed-off-by: Ingo Molnar --- arch/x86/mm/highmem_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 58f621e8191..0c6f43cee25 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -24,7 +24,7 @@ void kunmap(struct page *page) * no global lock is needed and because the kmap code must perform a global TLB * invalidation when the kmap pool wraps. * - * However when holding an atomic kmap is is not legal to sleep, so atomic + * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) -- cgit v1.2.3-70-g09d2 From 788e5abc5441e9046dd91c995c6f1f75bbd144bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:58 +0900 Subject: percpu: drop @unit_size from embed first chunk allocator The only extra feature @unit_size provides is making dead space at the end of the first chunk which doesn't have any valid usecase. Drop the parameter. This will increase consistency with generalized 4k allocator. James Bottomley spotted missing conversion for the default setup_per_cpu_areas() which caused build breakage on all archs which use it.
[ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 2 +- mm/percpu.c | 18 ++++++------------ 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4..14728206fb5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -342,7 +342,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); + reserve - PERCPU_FIRST_CHUNK_RESERVE); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e5000343dd6..83bff053bd1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -69,7 +69,7 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size); + ssize_t dyn_size); /* * Use this to get to a cpu's version of the per-cpu object diff --git a/mm/percpu.c b/mm/percpu.c index 19dd83b5cbd..fc6babe6e55 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1207,7 +1207,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1219,9 +1218,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * page size. 
* * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned to the bootmem allocator. @@ -1231,7 +1230,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * percpu access on success, -errno on failure. */ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) + ssize_t dyn_size) { size_t chunk_size; unsigned int cpu; @@ -1242,12 +1241,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, if (dyn_size != 0) dyn_size = pcpue_size - static_size - reserved_size; - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, @@ -1304,7 +1298,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. */ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, -1); + PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3-70-g09d2 From d4b95f80399471e4bce5e992700ff7f06ef91f6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize 4k first chunk allocator Generalize and move x86 setup_pcpu_4k() into pcpu_4k_first_chunk(). 
setup_pcpu_4k() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free memory, pcpu_4k_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, s/pcpu_populate_pte_fn_t/pcpu_fc_populate_pte_fn_t/ for consistency. [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 78 ++++++++++---------------------------- include/linux/percpu.h | 12 +++++- mm/percpu.c | 85 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 62 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 14728206fb5..ab896b31e80 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -123,6 +123,19 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +{ + return pcpu_alloc_bootmem(cpu, size, size); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + /* * Large page remap allocator * @@ -346,22 +359,11 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k page allocator + * 4k allocator * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. + * Boring fallback 4k allocator. This allocator puts more pressure on + * PTE TLBs but other than that behaves nicely on both UMA and NUMA. 
*/ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -369,51 +371,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr) static ssize_t __init setup_pcpu_4k(size_t static_size) { - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; + return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpu4k_populate_pte); } /* for explicit first chunk allocator selection */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 83bff053bd1..41b5bfab419 
100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,18 +59,26 @@ extern void *pcpu_base_addr; typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); +typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + pcpu_fc_populate_pte_fn_t populate_pte_fn); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +extern ssize_t __init pcpu_4k_first_chunk( + size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index fc6babe6e55..27b0f40a3ea 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1037,7 +1037,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1270,6 +1270,89 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, pcpue_unit_size, pcpue_ptr, NULL); } +/* + * 4k page first chunk setup helper. 
+ */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +/** + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: function to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure.
+ */ +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() * + sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); + goto enomem; + } + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + reserved_size, -1, + -1, NULL, populate_pte_fn); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + /* * Generic percpu area setup. * -- cgit v1.2.3-70-g09d2 From 8c4bfc6e8801616ab2e01c38140b2159b388d2ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: x86,percpu: generalize lpage first chunk allocator Generalize and move x86 setup_pcpu_lpage() into pcpu_lpage_first_chunk(). setup_pcpu_lpage() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free/map memory, pcpu_lpage_first_chunk() is identical to the original implementation. 
This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, factor out pcpu_calc_fc_sizes() which is common to pcpu_embed_first_chunk() and pcpu_lpage_first_chunk(). [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 9 -- arch/x86/kernel/setup_percpu.c | 169 +++------------------------------ arch/x86/mm/pageattr.c | 1 + include/linux/percpu.h | 27 ++++++ mm/percpu.c | 209 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 244 insertions(+), 171 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d8..a18c038a307 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -156,15 +156,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ab896b31e80..4f2e0ac9130 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size) } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. 
The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Large page remapping allocator */ #ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void __init pcpul_map(void *ptr, size_t size, void *addr) { - size_t off = (size_t)pageno << PAGE_SHIFT; + pmd_t *pmd, pmd_v; - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + pmd = populate_extra_pmd((unsigned long)addr); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; @@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -EINVAL; } - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. 
- */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. 
- */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. 
- */ -void *pcpu_lpage_remapped(void *kaddr) -{ - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; + return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a896..c106f785242 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 41b5bfab419..9f6bfd7d4b9 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, @@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#ifdef CONFIG_NEED_MULTIPLE_NODES +extern ssize_t __init 
pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn); + +extern void *pcpu_lpage_remapped(void *kaddr); +#else +static inline ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + return -EINVAL; +} + +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index f3fe7bc7378..17db527ee2e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, return pcpu_unit_size; } +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /* * Embedding first chunk setup helper. */ @@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? 
dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; + pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); @@ -1390,6 +1400,197 @@ out_free_ar: return ret; } +/* + * Large page remapping first chunk setup helper + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static size_t pcpul_unit_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; + +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpul_size) + return NULL; + + return virt_to_page(pcpul_map[cpu].ptr + off); +} + +/** + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @lpage_size: the size of a large page + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size + * @free_fn: function to free percpu memory, @size <= lpage_size + * @map_fn: function to map percpu lpage, always called with lpage_size + * + * This allocator uses large page as unit. A large page is allocated + * for each cpu and each is remapped into vmalloc area using large + * page mapping. As large page can be quite large, only part of it is + * used for the first chunk. Unused part is returned to the bootmem + * allocator. + * + * So, the large pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more large TLB entry pressure but still is + * much better than only using 4k mappings while still being NUMA + * friendly. 
+ * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + size_t size_sum; + size_t map_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + + pcpul_unit_size = lpage_size; + pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + if (pcpul_size > pcpul_unit_size) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); + + for_each_possible_cpu(cpu) { + void *ptr; + + ptr = alloc_fn(cpu, lpage_size); + if (!ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); + goto enomem; + } + + /* + * Only use pcpul_size bytes and give back the rest. + * + * Ingo: The lpage_size up-rounding bootmem is needed + * to make sure the partial lpage is still fully RAM - + * it's not well-specified to have an incompatible area + * (unmapped RAM, device memory, etc.) in that hole.
+ */ + free_fn(ptr + pcpul_size, lpage_size - pcpul_size); + + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = ptr; + + memcpy(ptr, __per_cpu_load, static_size); + } + + /* allocate address and map */ + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; + vm_area_register_early(&pcpul_vm, pcpul_unit_size); + + for_each_possible_cpu(cpu) + map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, + pcpul_vm.addr + cpu * pcpul_unit_size); + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", pcpul_vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, + reserved_size, dyn_size, pcpul_unit_size, + pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; + +enomem: + for_each_possible_cpu(cpu) + if (pcpul_map[cpu].ptr) + free_fn(pcpul_map[cpu].ptr, pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu large + * page mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used large + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. 
+ */ +void *pcpu_lpage_remapped(void *kaddr) +{ + unsigned long unit_mask = pcpul_unit_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); + unsigned long offset = (unsigned long)kaddr & unit_mask; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < lpage_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > lpage_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * pcpul_unit_size + offset; + } + } + + return NULL; +} +#endif + /* * Generic percpu area setup. * -- cgit v1.2.3-70-g09d2 From a530b7958612bafe2027e21359083dba84f0b3b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: percpu: teach large page allocator about NUMA Large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into separate large page just to return most of the allocated space back wasting large amount of vmalloc space and increasing cache footprint. This patch teaches NUMA details to large page allocator. Given processor proximity information, pcpu_lpage_build_unit_map() will find fitting cpu -> unit mapping in which cpus in LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted. This greatly reduces the unit and thus chunk size and wastes much less address space for the first chunk. For example, on 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code only uses 4MB - one 2MB page for each node.
[ Impact: much better space efficiency on NUMA machines ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Jan Beulich Cc: Andi Kleen Cc: David Miller --- arch/x86/kernel/setup_percpu.c | 72 +++++++-- include/linux/percpu.h | 24 ++- mm/percpu.c | 358 ++++++++++++++++++++++++++++++++--------- 3 files changed, 359 insertions(+), 95 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4f2e0ac9130..7501bb14bd5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr) set_pmd(pmd, pmd_v); } +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +{ + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; + size_t unit_map_size, unit_size; + int *unit_map; + int nr_units; + ssize_t ret; + + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) + return -EINVAL; + + /* need PSE */ + if (!cpu_has_pse) { + pr_warning("PERCPU: lpage allocator requires PSE\n"); + return -EINVAL; + } + /* allocate and build unit_map */ + unit_map_size = num_possible_cpus() * sizeof(int); + unit_map = alloc_bootmem_nopanic(unit_map_size); + if (!unit_map) { + pr_warning("PERCPU: failed to allocate unit_map\n"); + return -ENOMEM; + } + + ret = pcpu_lpage_build_unit_map(static_size, + PERCPU_FIRST_CHUNK_RESERVE, + &dyn_size, &unit_size, PMD_SIZE, + unit_map, pcpu_lpage_cpu_distance); + if (ret < 0) { + pr_warning("PERCPU: failed to build unit_map\n"); + goto out_free; + } + nr_units = ret; + + /* do the parameters look okay? 
*/ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; + size_t tot_size = nr_units * unit_size; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - return -EINVAL; + ret = -EINVAL; + goto out_free; } } - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, unit_size, PMD_SIZE, + unit_map, nr_units, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); +out_free: + if (ret < 0) + free_bootmem(__pa(unit_map), unit_map_size); + return ret; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) @@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void) /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = + delta + pcpu_unit_map[cpu] * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1e0e8878dc2..8ce91af4aa1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned 
int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( @@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_populate_pte_fn_t populate_pte_fn); #ifdef CONFIG_NEED_MULTIPLE_NODES +extern int __init pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + extern ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); extern void *pcpu_lpage_remapped(void *kaddr); #else +static inline int pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + return -EINVAL; +} + static inline ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) diff --git a/mm/percpu.c b/mm/percpu.c index 2196fae24f0..b3d0bcff8c7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -1594,75 +1595,259 @@ out_free_ar: * Large page remapping first chunk setup helper */ #ifdef CONFIG_NEED_MULTIPLE_NODES + +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter 
for unit size + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds cpu -> unit map and determine other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of lpage_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. 
+ */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. 
+ */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + struct pcpul_ent { - unsigned int cpu; void *ptr; + void *map_addr; }; static size_t pcpul_size; -static size_t pcpul_unit_size; +static size_t pcpul_lpage_size; +static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + if (unit_map[cpu] == unit) { + if (cpup) + *cpup = cpu; + return true; + } + + return false; +} + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); 
+ else + printk("%s ", empty_str); + } + printk("\n"); +} /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes + * @unit_size: unit size in bytes * @lpage_size: the size of a large page + * @unit_map: cpu -> unit mapping + * @nr_units: the number of units * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * - * This allocator uses large page as unit. A large page is allocated - * for each cpu and each is remapped into vmalloc area using large - * page mapping. As large page can be quite large, only part of it is - * used for the first chunk. Unused part is returned to the bootmem - * allocator. - * - * So, the large pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more large TLB entry pressure but still is - * much better than only using 4k mappings while still being NUMA - * friendly. + * This allocator uses large page to build and map the first chunk. + * Unlike other helpers, the caller should always specify @dyn_size + * and @unit_size. These parameters along with @unit_map and + * @nr_units can be determined using pcpu_lpage_build_unit_map(). + * This two stage initialization is to allow arch code to evaluate the + * parameters before committing to it. + * + * Large pages are allocated as directed by @unit_map and other + * parameters and mapped to vmalloc space. Unused holes are returned + * to the page allocator. 
Note that these holes end up being actively + * mapped twice - once to the physical mapping and to the vmalloc area + * for the first percpu chunk. Depending on architecture, this might + * cause problem when changing page attributes of the returned area. + * These double mapped areas can be detected using + * pcpu_lpage_remapped(). * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { - size_t size_sum; + static struct vm_struct vm; + size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; - int i, j; ssize_t ret; + int i, j, unit; - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. 
- */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, + unit_size, lpage_size, unit_map, nr_units); - pcpul_unit_size = lpage_size; - pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - if (pcpul_size > pcpul_unit_size) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } + BUG_ON(chunk_size % lpage_size); + + pcpul_size = static_size + reserved_size + dyn_size; + pcpul_lpage_size = lpage_size; + pcpul_nr_lpages = chunk_size / lpage_size; /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); pcpul_map = alloc_bootmem(map_size); - for_each_possible_cpu(cpu) { + /* allocate all pages */ + for (i = 0; i < pcpul_nr_lpages; i++) { + size_t offset = i * lpage_size; + int first_unit = offset / unit_size; + int last_unit = (offset + lpage_size - 1) / unit_size; void *ptr; + /* find out which cpu is mapped to this unit */ + for (unit = first_unit; unit <= last_unit; unit++) + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + goto found; + continue; + found: ptr = alloc_fn(cpu, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " @@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, goto enomem; } - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The lpage_size up-rounding bootmem is needed - * to make sure the partial lpage is still fully RAM - - * it's not well-specified to have a incompatible area - * (unmapped RAM, device memory, etc.) in that hole. 
- */ - free_fn(ptr + pcpul_size, lpage_size - pcpul_size); - - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = ptr; + pcpul_map[i].ptr = ptr; + } - memcpy(ptr, __per_cpu_load, static_size); + /* return unused holes */ + for (unit = 0; unit < nr_units; unit++) { + size_t start = unit * unit_size; + size_t end = start + unit_size; + size_t off, next; + + /* don't free used part of occupied unit */ + if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + start += pcpul_size; + + /* unit can span more than one page, punch the holes */ + for (off = start; off < end; off = next) { + void *ptr = pcpul_map[off / lpage_size].ptr; + next = min(roundup(off + 1, lpage_size), end); + if (ptr) + free_fn(ptr + off % lpage_size, next - off); + } } - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; - vm_area_register_early(&pcpul_vm, pcpul_unit_size); + /* allocate address, map and copy */ + vm.flags = VM_ALLOC; + vm.size = chunk_size; + vm_area_register_early(&vm, unit_size); + + for (i = 0; i < pcpul_nr_lpages; i++) { + if (!pcpul_map[i].ptr) + continue; + pcpul_map[i].map_addr = vm.addr + i * lpage_size; + map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); + } for_each_possible_cpu(cpu) - map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, - pcpul_vm.addr + cpu * pcpul_unit_size); + memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, + static_size); /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); + "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = 
tmp; - } + unit_size, vm.addr, unit_map); + + /* + * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped + * lpages are pushed to the end and trimmed. + */ + for (i = 0; i < pcpul_nr_lpages - 1; i++) + for (j = i + 1; j < pcpul_nr_lpages; j++) { + struct pcpul_ent tmp; + + if (!pcpul_map[j].ptr) + continue; + if (pcpul_map[i].ptr && + pcpul_map[i].ptr < pcpul_map[j].ptr) + continue; + + tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) + pcpul_nr_lpages--; return ret; enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_fn(pcpul_map[cpu].ptr, pcpul_size); + for (i = 0; i < pcpul_nr_lpages; i++) + if (pcpul_map[i].ptr) + free_fn(pcpul_map[i].ptr, lpage_size); free_bootmem(__pa(pcpul_map), map_size); return -ENOMEM; } @@ -1739,10 +1950,10 @@ enomem: */ void *pcpu_lpage_remapped(void *kaddr) { - unsigned long unit_mask = pcpul_unit_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); - unsigned long offset = (unsigned long)kaddr & unit_mask; - int left = 0, right = num_possible_cpus() - 1; + unsigned long lpage_mask = pcpul_lpage_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); + unsigned long offset = (unsigned long)kaddr & lpage_mask; + int left = 0, right = pcpul_nr_lpages - 1; int pos; /* pcpul in use at all? 
*/ @@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr) left = pos + 1; else if (pcpul_map[pos].ptr > lpage_addr) right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * pcpul_unit_size + offset; - } + else + return pcpul_map[pos].map_addr + offset; } return NULL; -- cgit v1.2.3-70-g09d2 From 42204455f160dab0c47f19e1be23f5c927af2d17 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:50:00 +0530 Subject: x86: Clean up mtrr/amd.c: Fix trivial style problems : ERROR: trailing whitespace WARNING: line over 80 characters ERROR: do not use C99 // comments arch/x86/kernel/cpu/mtrr/amd.o: text data bss dec hex filename 501 32 0 533 215 amd.o.before 501 32 0 533 215 amd.o.after md5: 62f795eb840ee2d17b03df89e789e76c amd.o.before.asm 62f795eb840ee2d17b03df89e789e76c amd.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Also restructured comments to be standard, removed stray return, converted function description to DocBook style, etc. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/amd.c | 97 ++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index ee2331b0e58..33af14110df 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -7,15 +7,15 @@ static void amd_get_mtrr(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type) + unsigned long *size, mtrr_type *type) { unsigned long low, high; rdmsr(MSR_K6_UWCCR, low, high); - /* Upper dword is region 1, lower is region 0 */ + /* Upper dword is region 1, lower is region 0 */ if (reg == 1) low = high; - /* The base masks off on the right alignment */ + /* The base masks off on the right alignment */ *base = (low & 0xFFFE0000) >> PAGE_SHIFT; *type = 0; if (low & 1) @@ -27,74 +27,81 @@ amd_get_mtrr(unsigned int reg, unsigned long *base, return; } /* - * This needs a little explaining. The size is stored as an - * inverted mask of bits of 128K granularity 15 bits long offset - * 2 bits + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. * - * So to get a size we do invert the mask and add 1 to the lowest - * mask bit (4 as its 2 bits in). This gives us a size we then shift - * to turn into 128K blocks + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. * - * eg 111 1111 1111 1100 is 512K + * eg 111 1111 1111 1100 is 512K * - * invert 000 0000 0000 0011 - * +1 000 0000 0000 0100 - * *128K ... + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... 
*/ low = (~low) & 0x1FFFC; *size = (low + 4) << (15 - PAGE_SHIFT); - return; } -static void amd_set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - The register to set. - The base address of the region. - The size of the region. If this is 0 the region is disabled. - The type of the region. - [RETURNS] Nothing. -*/ +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. + */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { u32 regs[2]; /* - * Low is MTRR0 , High MTRR 1 + * Low is MTRR0, High MTRR 1 */ rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); /* - * Blank to disable + * Blank to disable */ - if (size == 0) + if (size == 0) { regs[reg] = 0; - else - /* Set the register to the base, the type (off by one) and an - inverted bitmask of the size The size is the only odd - bit. We are fed say 512K We invert this and we get 111 1111 - 1111 1011 but if you subtract one and invert you get the - desired 111 1111 1111 1100 mask - - But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! + */ regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) | (base << PAGE_SHIFT) | (type + 1); + } /* - * The writeback rule is quite specific. See the manual. Its - * disable local interrupts, write back the cache, set the mtrr + * The writeback rule is quite specific. 
See the manual. Its + * disable local interrupts, write back the cache, set the mtrr */ wbinvd(); wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); } -static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { - /* Apply the K6 block alignment and size rules - In order - o Uncached or gathering only - o 128K or bigger block - o Power of 2 block - o base suitably aligned to the power - */ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) || (size & ~(size - 1)) - size || (base & (size - 1))) return -EINVAL; @@ -115,5 +122,3 @@ int __init amd_init_mtrr(void) set_mtrr_ops(&amd_mtrr_ops); return 0; } - -//arch_initcall(amd_mtrr_init); -- cgit v1.2.3-70-g09d2 From 6c4caa1ab737502190e416b76e6c10d2bf24276a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:50:44 +0530 Subject: x86: Clean up mtrr/centaur.c Remove dead code and fix trivial style problems: ERROR: trailing whitespace X 2 WARNING: line over 80 characters X 3 ROR: trailing whitespace ERROR: do not use C99 // comments X 2 arch/x86/kernel/cpu/mtrr/centaur.o: text data bss dec hex filename 605 32 68 705 2c1 centaur.o.before 605 32 68 705 2c1 centaur.o.after md5: a4865ea98ce3c163bb1d376a3949b3e3 centaur.o.before.asm a4865ea98ce3c163bb1d376a3949b3e3 centaur.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Standardized comments, DocBook, curly braces, newlines. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/centaur.c | 168 ++++++++----------------------------- 1 file changed, 35 insertions(+), 133 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index cb9aa3a7a7a..de89f14eff3 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -1,7 +1,9 @@ #include #include + #include #include + #include "mtrr.h" static struct { @@ -12,25 +14,25 @@ static struct { static u8 centaur_mcr_reserved; static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ -/* - * Report boot time MCR setups +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. */ - static int centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free MTRR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. 
-*/ { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { if (centaur_mcr_reserved & (1 << i)) continue; @@ -38,11 +40,14 @@ centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) if (lsize == 0) return i; } + return -ENOSPC; } -void -mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +/* + * Report boot time MCR setups + */ +void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { centaur_mcr[mcr].low = lo; centaur_mcr[mcr].high = hi; @@ -54,33 +59,35 @@ centaur_get_mcr(unsigned int reg, unsigned long *base, { *base = centaur_mcr[reg].high >> PAGE_SHIFT; *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; - *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) *type = MTRR_TYPE_UNCACHABLE; if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) *type = MTRR_TYPE_WRBACK; if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) *type = MTRR_TYPE_WRBACK; - } -static void centaur_set_mcr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) { unsigned long low, high; if (size == 0) { - /* Disable */ + /* Disable */ high = low = 0; } else { high = base << PAGE_SHIFT; - if (centaur_mcr_type == 0) - low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ - else { + if (centaur_mcr_type == 0) { + /* Only support write-combining... 
*/ + low = -size << PAGE_SHIFT | 0x1f; + } else { if (type == MTRR_TYPE_UNCACHABLE) - low = -size << PAGE_SHIFT | 0x02; /* NC */ + low = -size << PAGE_SHIFT | 0x02; /* NC */ else - low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ } } centaur_mcr[reg].high = high; @@ -88,118 +95,16 @@ static void centaur_set_mcr(unsigned int reg, unsigned long base, wrmsr(MSR_IDT_MCR0 + reg, low, high); } -#if 0 -/* - * Initialise the later (saner) Winchip MCR variant. In this version - * the BIOS can pass us the registers it has used (but not their values) - * and the control register is read/write - */ - -static void __init -centaur_mcr1_init(void) -{ - unsigned i; - u32 lo, hi; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. - */ - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ - lo &= ~0x1C0; /* clear key */ - lo |= 0x040; /* set key to 1 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ - } - - centaur_mcr_type = 1; - - /* - * Clear any unconfigured MCR's. - */ - - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { - if (!(lo & (1 << (9 + i)))) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - else - /* - * If the BIOS set up an MCR we cannot see it - * but we don't wish to obliterate it - */ - centaur_mcr_reserved |= (1 << i); - } - } - /* - * Throw the main write-combining switch... - * However if OOSTORE is enabled then people have already done far - * cleverer things and we should behave. - */ - - lo |= 15; /* Write combine enables */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -/* - * Initialise the original winchip with read only MCR registers - * no used bitmask for the BIOS to pass on and write only control - */ - -static void __init -centaur_mcr0_init(void) -{ - unsigned i; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. 
- */ - /* Clear any unconfigured MCR's. - * This way we are sure that the centaur_mcr array contains the actual - * values. The disadvantage is that any BIOS tweaks are thus undone. - * - */ - for (i = 0; i < 8; ++i) { - if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) - wrmsr(MSR_IDT_MCR0 + i, 0, 0); - } - - wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ -} - -/* - * Initialise Winchip series MCR registers - */ - -static void __init -centaur_mcr_init(void) -{ - struct set_mtrr_context ctxt; - - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - if (boot_cpu_data.x86_model == 4) - centaur_mcr0_init(); - else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) - centaur_mcr1_init(); - - set_mtrr_done(&ctxt); -} -#endif - -static int centaur_validate_add_page(unsigned long base, - unsigned long size, unsigned int type) +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) { /* - * FIXME: Winchip2 supports uncached + * FIXME: Winchip2 supports uncached */ - if (type != MTRR_TYPE_WRCOMB && + if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - printk(KERN_WARNING - "mtrr: only write-combining%s supported\n", - centaur_mcr_type ? " and uncacheable are" - : " is"); + pr_warning("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? 
" and uncacheable are" : " is"); return -EINVAL; } return 0; @@ -207,7 +112,6 @@ static int centaur_validate_add_page(unsigned long base, static struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, -// .init = centaur_mcr_init, .set = centaur_set_mcr, .get = centaur_get_mcr, .get_free_region = centaur_get_free_region, @@ -220,5 +124,3 @@ int __init centaur_init_mtrr(void) set_mtrr_ops(&centaur_mtrr_ops); return 0; } - -//arch_initcall(centaur_init_mtrr); -- cgit v1.2.3-70-g09d2 From 63f9600fadb10ea739108ae93e3e842d9843c58b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:51:32 +0530 Subject: x86: Clean up mtrr/cleanup.c Fix trivial style problems: WARNING: Use #include <linux/uaccess.h> instead of <asm/uaccess.h> WARNING: Use #include <linux/kvm_para.h> instead of <asm/kvm_para.h> Also, nr_mtrr_spare_reg should be unsigned long. arch/x86/kernel/cpu/mtrr/cleanup.o: text data bss dec hex filename 6241 8992 2056 17289 4389 cleanup.o.before 6241 8992 2056 17289 4389 cleanup.o.after The md5 has changed: 1a7a27513aef1825236daf29110fe657 cleanup.o.before.asm bcea358efa2532b6020e338e158447af cleanup.o.after.asm Because a WARN_ON()'s __LINE__ value changed by 3 lines. Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Did lots of other cleanups to make the code look more consistent.
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 350 +++++++++++++++++++------------------ 1 file changed, 176 insertions(+), 174 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 1d584a18a50..b8aba811b60 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -1,51 +1,52 @@ -/* MTRR (Memory Type Range Register) cleanup - - Copyright (C) 2009 Yinghai Lu - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ #include #include #include #include #include -#include #include +#include +#include +#include +#include #include #include -#include -#include #include -#include + #include "mtrr.h" -/* should be related to MTRR_VAR_RANGES nums */ +/* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 struct res_range { - unsigned long start; - unsigned long end; + unsigned long start; + unsigned long end; }; static int __init -add_range(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { - /* out of slots */ + /* Out of slots: */ if (nr_range >= RANGE_NUM) return nr_range; @@ -58,12 +59,12 @@ add_range(struct res_range *range, int nr_range, unsigned long start, } static int __init -add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, - unsigned long end) +add_range_with_merge(struct res_range *range, int nr_range, + unsigned long start, unsigned long end) { int i; - /* try to merge it with old one */ + /* Try to merge it with old one: */ for (i = 0; i < nr_range; i++) { unsigned long final_start, final_end; unsigned long common_start, common_end; @@ -84,7 +85,7 @@ add_range_with_merge(struct res_range *range, int nr_range, unsigned long start, return nr_range; } - /* need to add that */ + /* Need to add it: */ return add_range(range, nr_range, start, end); } @@ -117,7 +118,7 @@ subtract_range(struct res_range *range, unsigned long start, unsigned long end) } if (start > range[j].start && end < range[j].end) { - /* find the new spare */ + /* Find the new spare: */ for (i = 0; i < RANGE_NUM; i++) { if (range[i].end == 0) break; @@ -147,13 +148,19 @@ static int __init cmp_range(const void *x1, const void *x2) } 
struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; }; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) + + +#define BIOS_BUG_MSG KERN_WARNING \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init x86_get_mtrr_mem_range(struct res_range *range, int nr_range, @@ -180,7 +187,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, range[i].start, range[i].end + 1); } - /* take out UC ranges */ + /* Take out UC ranges: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_UNCACHABLE && @@ -244,10 +251,9 @@ static int __initdata nr_range; static unsigned long __init sum_ranges(struct res_range *range, int nr_range) { - unsigned long sum; + unsigned long sum = 0; int i; - sum = 0; for (i = 0; i < nr_range; i++) sum += range[i].end + 1 - range[i].start; @@ -288,7 +294,7 @@ struct var_mtrr_state { static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type, unsigned int address_bits) + unsigned char type, unsigned int address_bits) { u32 base_lo, base_hi, mask_lo, mask_hi; u64 base, mask; @@ -301,7 +307,7 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, mask = (1ULL << address_bits) - 1; mask &= ~((((u64)sizek) << 10) - 1); - base = ((u64)basek) << 10; + base = ((u64)basek) << 10; base |= type; mask |= 0x800; @@ -317,15 +323,14 @@ set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, static void __init save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, - unsigned char type) + unsigned char type) { range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); range_state[reg].size_pfn = sizek 
>> (PAGE_SHIFT - 10); range_state[reg].type = type; } -static void __init -set_var_mtrr_all(unsigned int address_bits) +static void __init set_var_mtrr_all(unsigned int address_bits) { unsigned long basek, sizek; unsigned char type; @@ -342,11 +347,11 @@ set_var_mtrr_all(unsigned int address_bits) static unsigned long to_size_factor(unsigned long sizek, char *factorp) { - char factor; unsigned long base = sizek; + char factor; if (base & ((1<<10) - 1)) { - /* not MB alignment */ + /* Not MB-aligned: */ factor = 'K'; } else if (base & ((1<<20) - 1)) { factor = 'M'; @@ -372,11 +377,12 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long max_align, align; unsigned long sizek; - /* Compute the maximum size I can make a range */ + /* Compute the maximum size with which we can make a range: */ if (range_startk) max_align = ffs(range_startk) - 1; else max_align = 32; + align = fls(range_sizek) - 1; if (align > max_align) align = max_align; @@ -386,11 +392,10 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; - start_base = to_size_factor(range_startk, - &start_factor), - size_base = to_size_factor(sizek, &size_factor), + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); - printk(KERN_DEBUG "Setting variable MTRR %d, " + Dprintk("Setting variable MTRR %d, " "base: %ld%cB, range: %ld%cB, type %s\n", reg, start_base, start_factor, size_base, size_factor, @@ -425,10 +430,11 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, chunk_sizek = state->chunk_sizek; gran_sizek = state->gran_sizek; - /* align with gran size, prevent small block used up MTRRs */ + /* Align with gran size, prevent small block used up MTRRs: */ range_basek = ALIGN(state->range_startk, gran_sizek); if ((range_basek > basek) && basek) return second_sizek; + state->range_sizek -= (range_basek - 
state->range_startk); range_sizek = ALIGN(state->range_sizek, gran_sizek); @@ -439,22 +445,21 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } state->range_sizek = range_sizek; - /* try to append some small hole */ + /* Try to append some small hole: */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); - /* no increase */ + /* No increase: */ if (range0_sizek == state->range_sizek) { - if (debug_print) - printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + state->range_sizek)<<10); + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); state->reg = range_to_mtrr(state->reg, range0_basek, state->range_sizek, MTRR_TYPE_WRBACK); return 0; } - /* only cut back, when it is not the last */ + /* Only cut back when it is not the last: */ if (sizek) { while (range0_basek + range0_sizek > (basek + sizek)) { if (range0_sizek >= chunk_sizek) @@ -470,16 +475,16 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_try: range_basek = range0_basek + range0_sizek; - /* one hole in the middle */ + /* One hole in the middle: */ if (range_basek > basek && range_basek <= (basek + sizek)) second_sizek = range_basek - basek; if (range0_sizek > state->range_sizek) { - /* one hole in middle or at end */ + /* One hole in middle or at the end: */ hole_sizek = range0_sizek - state->range_sizek - second_sizek; - /* hole size should be less than half of range0 size */ + /* Hole size should be less than half of range0 size: */ if (hole_sizek >= (range0_sizek >> 1) && range0_sizek >= chunk_sizek) { range0_sizek -= chunk_sizek; @@ -491,32 +496,30 @@ second_try: } if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); state->reg = 
range_to_mtrr(state->reg, range0_basek, range0_sizek, MTRR_TYPE_WRBACK); } if (range0_sizek < state->range_sizek) { - /* need to handle left over */ + /* Need to handle left over range: */ range_sizek = state->range_sizek - range0_sizek; - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); + Dprintk("range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, MTRR_TYPE_WRBACK); } if (hole_sizek) { hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, MTRR_TYPE_UNCACHABLE); } @@ -537,23 +540,23 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, basek = base_pfn << (PAGE_SHIFT - 10); sizek = size_pfn << (PAGE_SHIFT - 10); - /* See if I can merge with the last range */ + /* See if I can merge with the last range: */ if ((basek <= 1024) || (state->range_startk + state->range_sizek == basek)) { unsigned long endk = basek + sizek; state->range_sizek = endk - state->range_startk; return; } - /* Write the range mtrrs */ + /* Write the range mtrrs: */ if (state->range_sizek != 0) second_sizek = range_to_mtrr_with_hole(state, basek, sizek); - /* Allocate an msr */ + /* Allocate an msr: */ state->range_startk = basek + second_sizek; state->range_sizek = sizek - second_sizek; } -/* mininum size of mtrr block that can take hole */ +/* Mininum size of mtrr block that can take hole: */ static u64 mtrr_chunk_size __initdata = (256ULL<<20); static int __init parse_mtrr_chunk_size_opt(char *p) @@ -565,7 +568,7 @@ static int __init parse_mtrr_chunk_size_opt(char *p) } early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); -/* granity 
of mtrr of block */ +/* Granularity of mtrr of block: */ static u64 mtrr_gran_size __initdata; static int __init parse_mtrr_gran_size_opt(char *p) @@ -577,7 +580,7 @@ static int __init parse_mtrr_gran_size_opt(char *p) } early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); -static int nr_mtrr_spare_reg __initdata = +static unsigned long nr_mtrr_spare_reg __initdata = CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; static int __init parse_mtrr_spare_reg(char *arg) @@ -586,7 +589,6 @@ static int __init parse_mtrr_spare_reg(char *arg) nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); return 0; } - early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init @@ -594,8 +596,8 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; - int i; int num_reg; + int i; var_state.range_startk = 0; var_state.range_sizek = 0; @@ -605,17 +607,18 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, memset(range_state, 0, sizeof(range_state)); - /* Write the range etc */ - for (i = 0; i < nr_range; i++) + /* Write the range: */ + for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, range[i].end - range[i].start + 1); + } - /* Write the last range */ + /* Write the last range: */ if (var_state.range_sizek != 0) range_to_mtrr_with_hole(&var_state, 0, 0); num_reg = var_state.reg; - /* Clear out the extra MTRR's */ + /* Clear out the extra MTRR's: */ while (var_state.reg < num_var_ranges) { save_var_mtrr(var_state.reg, 0, 0, 0); var_state.reg++; @@ -625,11 +628,11 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range, } struct mtrr_cleanup_result { - unsigned long gran_sizek; - unsigned long chunk_sizek; - unsigned long lose_cover_sizek; - unsigned int num_reg; - int bad; + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; }; /* @@ -645,10 +648,10 @@ static unsigned long __initdata 
min_loss_pfn[RANGE_NUM]; static void __init print_out_mtrr_range_state(void) { - int i; char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; mtrr_type type; + int i; for (i = 0; i < num_var_ranges; i++) { @@ -676,10 +679,10 @@ static int __init mtrr_need_cleanup(void) int i; mtrr_type type; unsigned long size; - /* extra one for all 0 */ + /* Extra one for all 0: */ int num[MTRR_NUM_TYPES + 1]; - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -693,88 +696,86 @@ static int __init mtrr_need_cleanup(void) num[type]++; } - /* check if we got UC entries */ + /* Check if we got UC entries: */ if (!num[MTRR_TYPE_UNCACHABLE]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != - num_var_ranges - num[MTRR_NUM_TYPES]) + num_var_ranges - num[MTRR_NUM_TYPES]) return 0; return 1; } static unsigned long __initdata range_sums; -static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size, - unsigned long extra_remove_base, - unsigned long extra_remove_size, - int i) + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) { - int num_reg; static struct res_range range_new[RANGE_NUM]; - static int nr_range_new; unsigned long range_sums_new; + static int nr_range_new; + int num_reg; - /* convert ranges to var ranges state */ - num_reg = x86_setup_var_mtrrs(range, nr_range, - chunk_size, gran_size); + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); - /* we got new setting in range_state, check it */ + /* We got new setting in range_state, check it: */ memset(range_new, 0, sizeof(range_new)); nr_range_new = x86_get_mtrr_mem_range(range_new, 0, - extra_remove_base, extra_remove_size); + 
x_remove_base, x_remove_size); range_sums_new = sum_ranges(range_new, nr_range_new); result[i].chunk_sizek = chunk_size >> 10; result[i].gran_sizek = gran_size >> 10; result[i].num_reg = num_reg; + if (range_sums < range_sums_new) { - result[i].lose_cover_sizek = - (range_sums_new - range_sums) << PSHIFT; + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; result[i].bad = 1; - } else - result[i].lose_cover_sizek = - (range_sums - range_sums_new) << PSHIFT; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } - /* double check it */ + /* Double check it: */ if (!result[i].bad && !result[i].lose_cover_sizek) { - if (nr_range_new != nr_range || - memcmp(range, range_new, sizeof(range))) - result[i].bad = 1; + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; } - if (!result[i].bad && (range_sums - range_sums_new < - min_loss_pfn[num_reg])) { - min_loss_pfn[num_reg] = - range_sums - range_sums_new; - } + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; } static void __init mtrr_print_out_one_result(int i) { - char gran_factor, chunk_factor, lose_factor; unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), - printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", - result[i].bad ? "*BAD*" : " ", - gran_base, gran_factor, chunk_base, chunk_factor); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", - result[i].num_reg, result[i].bad ? "-" : "", - lose_base, lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? 
"*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? "-" : "", + lose_base, lose_factor); } static int __init mtrr_search_optimal_index(void) { - int i; int num_reg_good; int index_good; + int i; if (nr_mtrr_spare_reg >= num_var_ranges) nr_mtrr_spare_reg = num_var_ranges - 1; + num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { if (!min_loss_pfn[i]) @@ -796,24 +797,24 @@ static int __init mtrr_search_optimal_index(void) return index_good; } - int __init mtrr_cleanup(unsigned address_bits) { - unsigned long extra_remove_base, extra_remove_size; + unsigned long x_remove_base, x_remove_size; unsigned long base, size, def, dummy; - mtrr_type type; u64 chunk_size, gran_size; + mtrr_type type; int index_good; int i; if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -822,29 +823,28 @@ int __init mtrr_cleanup(unsigned address_bits) range_state[i].type = type; } - /* check if we need handle it and can handle it */ + /* Check if we need handle it and can handle it: */ if (!mtrr_need_cleanup()) return 0; - /* print original var MTRRs at first, for debugging: */ + /* Print original var MTRRs at first, for debugging: */ printk(KERN_DEBUG "original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); - extra_remove_size = 0; - extra_remove_base = 1 << (32 - PAGE_SHIFT); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); if (mtrr_tom2) - extra_remove_size = - (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, - extra_remove_size); + x_remove_size = (mtrr_tom2 
>> PAGE_SHIFT) - x_remove_base; + + nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); /* - * [0, 1M) should always be coverred by var mtrr with WB - * and fixed mtrrs should take effective before var mtrr for it + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); - /* sort the ranges */ + /* Sort the ranges: */ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); @@ -854,7 +854,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { i = 0; mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); mtrr_print_out_one_result(i); @@ -880,7 +880,7 @@ int __init mtrr_cleanup(unsigned address_bits) continue; mtrr_calc_range_state(chunk_size, gran_size, - extra_remove_base, extra_remove_size, i); + x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); printk(KERN_INFO "\n"); @@ -890,7 +890,7 @@ int __init mtrr_cleanup(unsigned address_bits) } } - /* try to find the optimal index */ + /* Try to find the optimal index: */ index_good = mtrr_search_optimal_index(); if (index_good != -1) { @@ -898,7 +898,7 @@ int __init mtrr_cleanup(unsigned address_bits) i = index_good; mtrr_print_out_one_result(i); - /* convert ranges to var ranges state */ + /* Convert ranges to var ranges state: */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; gran_size = result[i].gran_sizek; @@ -941,8 +941,8 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup); * Note this won't check if the MTRRs < 4GB where the magic bit doesn't * apply to are wrong, but so far we don't know of any such case in the wild. 
*/ -#define Tom2Enabled (1U << 21) -#define Tom2ForceMemTypeWB (1U << 22) +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) int __init amd_special_default_mtrr(void) { @@ -952,7 +952,7 @@ int __init amd_special_default_mtrr(void) return 0; if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) return 0; - /* In case some hypervisor doesn't pass SYSCFG through */ + /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) return 0; /* @@ -965,19 +965,21 @@ int __init amd_special_default_mtrr(void) return 0; } -static u64 __init real_trim_memory(unsigned long start_pfn, - unsigned long limit_pfn) +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) { u64 trim_start, trim_size; + trim_start = start_pfn; trim_start <<= PAGE_SHIFT; + trim_size = limit_pfn; trim_size <<= PAGE_SHIFT; trim_size -= trim_start; - return e820_update_range(trim_start, trim_size, E820_RAM, - E820_RESERVED); + return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); } + /** * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs * @end_pfn: ending page frame number @@ -985,7 +987,7 @@ static u64 __init real_trim_memory(unsigned long start_pfn, * Some buggy BIOSes don't setup the MTRRs properly for systems with certain * memory configurations. This routine checks that the highest MTRR matches * the end of memory, to make sure the MTRRs having a write back type cover - * all of the memory the kernel is intending to use. If not, it'll trim any + * all of the memory the kernel is intending to use. If not, it'll trim any * memory off the end by adjusting end_pfn, removing it from the kernel's * allocation pools, warning the user with an obnoxious message. 
*/ @@ -994,21 +996,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) unsigned long i, base, size, highest_pfn = 0, def, dummy; mtrr_type type; u64 total_trim_size; - /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; + /* * Make sure we only trim uncachable memory on machines that * support the Intel MTRR architecture: */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; - /* get it and store it aside */ + /* Get it and store it aside: */ memset(range_state, 0, sizeof(range_state)); for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, &base, &size, &type); @@ -1017,7 +1020,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) range_state[i].type = type; } - /* Find highest cached pfn */ + /* Find highest cached pfn: */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; if (type != MTRR_TYPE_WRBACK) @@ -1028,13 +1031,13 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) highest_pfn = base + size; } - /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); return 0; } - /* check entries number */ + /* Check entries number: */ memset(num, 0, sizeof(num)); for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; @@ -1046,11 +1049,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) num[type]++; } - /* no entry for WB? */ + /* No entry for WB? 
*/ if (!num[MTRR_TYPE_WRBACK]) return 0; - /* check if we only had WB and UC */ + /* Check if we only had WB and UC: */ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != num_var_ranges - num[MTRR_NUM_TYPES]) return 0; @@ -1066,31 +1069,31 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + /* Check the head: */ total_trim_size = 0; - /* check the head */ if (range[0].start) total_trim_size += real_trim_memory(0, range[0].start); - /* check the holes */ + + /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { if (range[i].end + 1 < range[i+1].start) total_trim_size += real_trim_memory(range[i].end + 1, range[i+1].start); } - /* check the top */ + + /* Check the top: */ i = nr_range - 1; if (range[i].end + 1 < end_pfn) total_trim_size += real_trim_memory(range[i].end + 1, end_pfn); if (total_trim_size) { - printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" - " all of memory, losing %lluMB of RAM.\n", - total_trim_size >> 20); + pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); - printk(KERN_INFO "update e820 for mtrr\n"); + pr_info("update e820 for mtrr\n"); update_e820(); return 1; @@ -1098,4 +1101,3 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) return 0; } - -- cgit v1.2.3-70-g09d2 From 2311037708c170977506fbcbe0a2ba0c6d221940 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:52:08 +0530 Subject: x86: Clean up mtrr/cyrix.c Fix trivial style problems: WARNING: Use #include instead of WARNING: line over 80 characters ERROR: do not initialise statics to 0 or NULL ERROR: space prohibited after that open parenthesis '(' X 2 ERROR: space prohibited before that close parenthesis ')' X 2 ERROR: trailing whitespace X 2 ERROR: trailing statements should be on next line ERROR: do not use C99 // comments X 2 
arch/x86/kernel/cpu/mtrr/cyrix.o: text data bss dec hex filename 1637 32 8 1677 68d cyrix.o.before 1637 32 8 1677 68d cyrix.o.after md5: 6f52abd06905be3f4cabb5239f9b0ff0 cyrix.o.before.asm 6f52abd06905be3f4cabb5239f9b0ff0 cyrix.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Made the code more consistent ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cyrix.c | 94 ++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index ff14c320040..228d982ce09 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -1,38 +1,40 @@ #include +#include #include -#include -#include -#include + #include #include +#include +#include + #include "mtrr.h" static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) { - unsigned long flags; unsigned char arr, ccr3, rcr, shift; + unsigned long flags; arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - /* Save flags and disable interrupts */ local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ((unsigned char *) base)[3] = getCx86(arr); - ((unsigned char *) base)[2] = getCx86(arr + 1); - ((unsigned char *) base)[1] = getCx86(arr + 2); + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned char *)base)[1] = getCx86(arr + 2); rcr = getCx86(CX86_RCR_BASE + reg); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ - /* Enable interrupts if it was enabled previously */ local_irq_restore(flags); + shift = ((unsigned char *) base)[1] & 0x0f; *base >>= PAGE_SHIFT; - /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + /* + * Power of two, 
at least 4K on ARR0-ARR6, 256K on ARR7 * Note: shift==0xf means 4G, this is unsupported. */ if (shift) @@ -76,17 +78,20 @@ cyrix_get_arr(unsigned int reg, unsigned long *base, } } +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ static int cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) -/* [SUMMARY] Get a free ARR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ { - int i; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i; switch (replace_reg) { case 7: @@ -107,14 +112,17 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) cyrix_get_arr(7, &lbase, &lsize, <ype); if (lsize == 0) return 7; - /* Else try ARR0-ARR6 first */ + /* Else try ARR0-ARR6 first */ } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, <ype); if (lsize == 0) return i; } - /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ cyrix_get_arr(i, &lbase, &lsize, <ype); if ((lsize == 0) && (size >= 0x40)) return i; @@ -122,21 +130,22 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) return -ENOSPC; } -static u32 cr4 = 0; -static u32 ccr3; +static u32 cr4, ccr3; static void prepare_set(void) { u32 cr0; /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } - /* Disable and flush caches. Note that wbinvd flushes the TLBs as - a side-effect */ + /* + * Disable and flush caches. 
+ * Note that wbinvd flushes the TLBs as a side-effect + */ cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); @@ -147,22 +156,21 @@ static void prepare_set(void) /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); - } static void post_set(void) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ccr3); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); } @@ -178,7 +186,8 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, size >>= 6; size &= 0x7fff; /* make sure arr_size <= 14 */ - for (arr_size = 0; size; arr_size++, size >>= 1) ; + for (arr_size = 0; size; arr_size++, size >>= 1) + ; if (reg < 7) { switch (type) { @@ -215,18 +224,18 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base, prepare_set(); base <<= PAGE_SHIFT; - setCx86(arr, ((unsigned char *) &base)[3]); - setCx86(arr + 1, ((unsigned char *) &base)[2]); - setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); setCx86(CX86_RCR_BASE + reg, arr_type); post_set(); } typedef struct { - unsigned long base; - unsigned long size; - mtrr_type type; + unsigned long base; + unsigned long size; + mtrr_type type; } arr_state_t; static arr_state_t arr_state[8] = { @@ -247,16 +256,17 @@ static void cyrix_set_all(void) setCx86(CX86_CCR0 + i, ccr_state[i]); for (; i < 7; i++) setCx86(CX86_CCR4 + i, ccr_state[i]); - for (i = 0; i < 8; i++) - cyrix_set_arr(i, arr_state[i].base, + + for (i = 0; i < 8; i++) { + cyrix_set_arr(i, arr_state[i].base, arr_state[i].size, arr_state[i].type); + } post_set(); } static struct mtrr_ops 
cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, -// .init = cyrix_arr_init, .set_all = cyrix_set_all, .set = cyrix_set_arr, .get = cyrix_get_arr, @@ -270,5 +280,3 @@ int __init cyrix_init_mtrr(void) set_mtrr_ops(&cyrix_mtrr_ops); return 0; } - -//arch_initcall(cyrix_init_mtrr); -- cgit v1.2.3-70-g09d2 From a1a499a39911fcfecbebaba1f38588088909f918 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:53:00 +0530 Subject: x86: Clean up mtrr/generic.c Fix following trivial style problems: ERROR: trailing whitespace X 4 WARNING: Use #include instead of WARNING: braces {} are not necessary for single statement blocks X 3 ERROR: "foo * bar" should be "foo *bar" WARNING: line over 80 characters X 6 ERROR: "foo * bar" should be "foo *bar" ERROR: spaces required around that '=' (ctx:VxO) ERROR: space required before that '-' (ctx:OxV) WARNING: suspect code indent for conditional statements (8, 12) ERROR: spaces required around that '=' (ctx:VxV) ERROR: do not initialise statics to 0 or NULL ERROR: space prohibited after that open parenthesis '(' X 2 ERROR: space prohibited before that close parenthesis ')' X 2 ERROR: trailing statements should be on next line ERROR: return is not a function, parentheses are not required Also use pr_debug and pr_warning where possible. arch/x86/kernel/cpu/mtrr/generic.o: text data bss dec hex filename 5652 77 4224 9953 26e1 generic.o.before 5652 77 4220 9949 26dd generic.o.after The md5 changed: b34d6c045f06daa4ed092b90cc760e8f generic.o.before.asm a490c6251cfd8442fbffecc0e09a573d generic.o.after.asm Because mtrr_state moved from data to bss, changing its offsets - and also because __LINE__ numbers changed. 
Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ Further cleanups to make the code more consistent ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 304 +++++++++++++++++++++---------------- 1 file changed, 169 insertions(+), 135 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0543f69f0b2..55da0c5f68d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,28 +1,34 @@ -/* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - because MTRRs can span upto 40 bits (36bits on most modern x86) */ +/* + * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong + * because MTRRs can span upto 40 bits (36bits on most modern x86) + */ +#define DEBUG + +#include #include #include +#include #include -#include -#include -#include -#include -#include -#include + #include +#include #include +#include +#include +#include #include + #include "mtrr.h" struct fixed_range_block { - int base_msr; /* start address of an MTRR block */ - int ranges; /* number of MTRRs in this block */ + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ }; static struct fixed_range_block fixed_range_blocks[] = { - { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -30,10 +36,10 @@ static unsigned long smp_changes_mask; static int mtrr_state_set; u64 mtrr_tom2; -struct mtrr_state_type mtrr_state = {}; +struct mtrr_state_type mtrr_state; EXPORT_SYMBOL_GPL(mtrr_state); -/** +/* * BIOS is expected to clear MtrrFixDramModEn bit, see for example * 
"BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD * Opteron Processors" (26094 Rev. 3.30 February 2006), section @@ -104,9 +110,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) * Look of multiple ranges matching this address and pick type * as per MTRR precedence */ - if (!(mtrr_state.enabled & 2)) { + if (!(mtrr_state.enabled & 2)) return mtrr_state.def_type; - } prev_match = 0xFF; for (i = 0; i < num_var_ranges; ++i) { @@ -125,9 +130,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) if (start_state != end_state) return 0xFE; - if ((start & mask) != (base & mask)) { + if ((start & mask) != (base & mask)) continue; - } curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; if (prev_match == 0xFF) { @@ -148,9 +152,8 @@ u8 mtrr_type_lookup(u64 start, u64 end) curr_match = MTRR_TYPE_WRTHROUGH; } - if (prev_match != curr_match) { + if (prev_match != curr_match) return MTRR_TYPE_UNCACHABLE; - } } if (mtrr_tom2) { @@ -164,7 +167,7 @@ u8 mtrr_type_lookup(u64 start, u64 end) return mtrr_state.def_type; } -/* Get the MSR pair relating to a var range */ +/* Get the MSR pair relating to a var range */ static void get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) { @@ -172,7 +175,7 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); } -/* fill the MSR pair relating to a var range */ +/* Fill the MSR pair relating to a var range */ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) { @@ -186,10 +189,9 @@ void fill_mtrr_var_range(unsigned int index, vr[index].mask_hi = mask_hi; } -static void -get_fixed_ranges(mtrr_type * frs) +static void get_fixed_ranges(mtrr_type *frs) { - unsigned int *p = (unsigned int *) frs; + unsigned int *p = (unsigned int *)frs; int i; k8_check_syscfg_dram_mod_en(); @@ -217,22 +219,22 @@ static void __init print_fixed_last(void) if (!last_fixed_end) return; - printk(KERN_DEBUG " %05X-%05X %s\n", last_fixed_start, 
- last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + pr_debug(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); last_fixed_end = 0; } static void __init update_fixed_last(unsigned base, unsigned end, - mtrr_type type) + mtrr_type type) { last_fixed_start = base; last_fixed_end = end; last_fixed_type = type; } -static void __init print_fixed(unsigned base, unsigned step, - const mtrr_type *types) +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) { unsigned i; @@ -259,54 +261,55 @@ static void __init print_mtrr_state(void) unsigned int i; int high_width; - printk(KERN_DEBUG "MTRR default type: %s\n", - mtrr_attrib_to_str(mtrr_state.def_type)); + pr_debug("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - printk(KERN_DEBUG "MTRR fixed ranges %sabled:\n", - mtrr_state.enabled & 1 ? "en" : "dis"); + pr_debug("MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) - print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8); + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); for (i = 0; i < 8; ++i) - print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8); + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); /* tail */ print_fixed_last(); } - printk(KERN_DEBUG "MTRR variable ranges %sabled:\n", - mtrr_state.enabled & 2 ? "en" : "dis"); + pr_debug("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? 
"en" : "dis"); if (size_or_mask & 0xffffffffUL) high_width = ffs(size_or_mask & 0xffffffffUL) - 1; else high_width = ffs(size_or_mask>>32) + 32 - 1; high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) - printk(KERN_DEBUG " %u base %0*X%05X000 mask %0*X%05X000 %s\n", - i, - high_width, - mtrr_state.var_ranges[i].base_hi, - mtrr_state.var_ranges[i].base_lo >> 12, - high_width, - mtrr_state.var_ranges[i].mask_hi, - mtrr_state.var_ranges[i].mask_lo >> 12, - mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); else - printk(KERN_DEBUG " %u disabled\n", i); - } - if (mtrr_tom2) { - printk(KERN_DEBUG "TOM2: %016llx aka %lldM\n", - mtrr_tom2, mtrr_tom2>>20); + pr_debug(" %u disabled\n", i); } + if (mtrr_tom2) + pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); } -/* Grab all of the MTRR state for this CPU into *state */ +/* Grab all of the MTRR state for this CPU into *state */ void __init get_mtrr_state(void) { - unsigned int i; struct mtrr_var_range *vrs; - unsigned lo, dummy; unsigned long flags; + unsigned lo, dummy; + unsigned int i; vrs = mtrr_state.var_ranges; @@ -324,6 +327,7 @@ void __init get_mtrr_state(void) if (amd_special_default_mtrr()) { unsigned low, high; + /* TOP_MEM2 */ rdmsr(MSR_K8_TOP_MEM2, low, high); mtrr_tom2 = high; @@ -344,10 +348,9 @@ void __init get_mtrr_state(void) post_set(); local_irq_restore(flags); - } -/* Some BIOS's are fucked and don't set all MTRRs the same! */ +/* Some BIOS's are messed up and don't set all MTRRs the same! 
*/ void __init mtrr_state_warn(void) { unsigned long mask = smp_changes_mask; @@ -355,28 +358,33 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & MTRR_CHANGE_MASK_FIXED) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - printk(KERN_WARNING "mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); printk(KERN_INFO "mtrr: corrected configuration.\n"); } -/* Doesn't attempt to pass an error out to MTRR users - because it's quite complicated in some cases and probably not - worth it because the best error handling is to ignore it. */ +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. 
+ */ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { - if (wrmsr_safe(msr, a, b) < 0) + if (wrmsr_safe(msr, a, b) < 0) { printk(KERN_ERR "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); + } } /** - * set_fixed_range - checks & updates a fixed-range MTRR if it differs from the value it should have + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have * @msr: MSR address of the MTTR which should be checked and updated * @changed: pointer which indicates whether the MTRR needed to be changed * @msrwords: pointer to the MSR values which the MSR should have @@ -401,20 +409,23 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) * * Returns: The index of the region on success, else negative on error. */ -int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) { - int i, max; - mtrr_type ltype; unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; max = num_var_ranges; if (replace_reg >= 0 && replace_reg < max) return replace_reg; + for (i = 0; i < max; ++i) { mtrr_if->get(i, &lbase, &lsize, &ltype); if (lsize == 0) return i; } + return -ENOSPC; } @@ -434,7 +445,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { - /* Invalid (i.e.
free) range */ *base = 0; *size = 0; *type = 0; @@ -471,27 +482,31 @@ out_put_cpu: } /** - * set_fixed_ranges - checks & updates the fixed-range MTRRs if they differ from the saved set + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() */ -static int set_fixed_ranges(mtrr_type * frs) +static int set_fixed_ranges(mtrr_type *frs) { - unsigned long long *saved = (unsigned long long *) frs; + unsigned long long *saved = (unsigned long long *)frs; bool changed = false; - int block=-1, range; + int block = -1, range; k8_check_syscfg_dram_mod_en(); - while (fixed_range_blocks[++block].ranges) - for (range=0; range < fixed_range_blocks[block].ranges; range++) - set_fixed_range(fixed_range_blocks[block].base_msr + range, - &changed, (unsigned int *) saved++); + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } return changed; } -/* Set the MSR pair relating to a var range. Returns TRUE if - changes are made */ +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. 
+ */ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; @@ -501,6 +516,7 @@ static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); changed = true; } @@ -526,21 +542,26 @@ static u32 deftype_lo, deftype_hi; */ static unsigned long set_mtrr_state(void) { - unsigned int i; unsigned long change_mask = 0; + unsigned int i; - for (i = 0; i < num_var_ranges; i++) + for (i = 0; i < num_var_ranges; i++) { if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) change_mask |= MTRR_CHANGE_MASK_FIXED; - /* Set_mtrr_restore restores the old value of MTRRdefType, - so to set it we fiddle with the saved value */ + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ if ((deftype_lo & 0xff) != mtrr_state.def_type || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { - deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10); + + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | + (mtrr_state.enabled << 10); change_mask |= MTRR_CHANGE_MASK_DEFTYPE; } @@ -548,33 +569,36 @@ static unsigned long set_mtrr_state(void) } -static unsigned long cr4 = 0; +static unsigned long cr4; static DEFINE_SPINLOCK(set_atomicity_lock); /* - * Since we are disabling the cache don't allow any interrupts - they - * would run extremely slow and would only increase the pain. The caller must - * ensure that local interrupts are disabled and are reenabled after post_set() - * has been called. 
+ * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after post_set() has been called. */ - static void prepare_set(void) __acquires(set_atomicity_lock) { unsigned long cr0; - /* Note that this is not ideal, since the cache is only flushed/disabled - for this CPU while the MTRRs are changed, but changing this requires - more invasive changes to the way the kernel boots */ + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ spin_lock(&set_atomicity_lock); - /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_has_pge) { cr4 = read_cr4(); write_cr4(cr4 & ~X86_CR4_PGE); } @@ -582,26 +606,26 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ __flush_tlb(); - /* Save MTRR state */ + /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - /* Disable MTRRs, and set the default type to uncached */ + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) { - /* Flush TLBs (no need to flush caches - they are disabled) */ + /* Flush TLBs (no need to flush caches - they are disabled) */ __flush_tlb(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); - - /* Enable caches */ + + /* Enable caches */ write_cr0(read_cr0() & 
0xbfffffff); - /* Restore value of CR4 */ - if ( cpu_has_pge ) + /* Restore value of CR4 */ + if (cpu_has_pge) write_cr4(cr4); spin_unlock(&set_atomicity_lock); } @@ -623,24 +647,27 @@ static void generic_set_all(void) post_set(); local_irq_restore(flags); - /* Use the atomic bitops to update the global mask */ + /* Use the atomic bitops to update the global mask */ for (count = 0; count < sizeof mask * 8; ++count) { if (mask & 0x01) set_bit(count, &smp_changes_mask); mask >>= 1; } - + } +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. + */ static void generic_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) -/* [SUMMARY] Set variable MTRR register on the local CPU. - The register to set. - The base address of the region. - The size of the region. If this is 0 the region is disabled. - The type of the region. - [RETURNS] Nothing. -*/ { unsigned long flags; struct mtrr_var_range *vr; @@ -651,8 +678,10 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, prepare_set(); if (size == 0) { - /* The invalid bit is kept in the mask, so we simply clear the - relevant mask register to disable a range. */ + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. 
+ */ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); memset(vr, 0, sizeof(struct mtrr_var_range)); } else { @@ -669,46 +698,50 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, local_irq_restore(flags); } -int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) { unsigned long lbase, last; - /* For Intel PPro stepping <= 7, must be 4 MiB aligned - and not touch 0x70000000->0x7003FFFF */ + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } - /* Check upper bits of base and last are equal and lower bits are 0 - for base and 1 for last */ + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ last = base + size - 1; for (lbase = base; !(lbase & 1) && (last & 1); - lbase = lbase >> 1, last = last >> 1) ; + lbase = lbase >> 1, last = last >> 1) + ; if (lbase != last) { - printk(KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", - base, size); + pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; } - static int generic_have_wrcomb(void) { unsigned long config, dummy; rdmsr(MSR_MTRRcap, config, dummy); - 
return (config & (1 << 10)); + return config & (1 << 10); } int positive_have_wrcomb(void) @@ -716,14 +749,15 @@ int positive_have_wrcomb(void) return 1; } -/* generic structure... +/* + * Generic structure... */ struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .set_all = generic_set_all, - .get = generic_get_mtrr, - .get_free_region = generic_get_free_region, - .set = generic_set_mtrr, - .validate_add_page = generic_validate_add_page, - .have_wrcomb = generic_have_wrcomb, + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, }; -- cgit v1.2.3-70-g09d2 From 26dc67eda19beafb7e5ef2770cec5b3ee5995a8e Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:53:40 +0530 Subject: x86: Clean up mtrr/if.c Fix: WARNING: Use #include instead of ERROR: trailing whitespace X 7 ERROR: trailing statements should be on next line X 3 WARNING: line over 80 characters X 5 ERROR: space required before the open parenthesis '(' arch/x86/kernel/cpu/mtrr/if.o: text data bss dec hex filename 2239 4 0 2243 8c3 if.o.before 2239 4 0 2243 8c3 if.o.after md5: 78d1f2aa4843ec6509c18e2dee54bc7f if.o.before.asm 78d1f2aa4843ec6509c18e2dee54bc7f if.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ More cleanups to make the code more consistent. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/if.c | 135 ++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index fb73a52913a..08b6ea4c62b 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -1,27 +1,28 @@ -#include -#include #include -#include -#include #include -#include +#include +#include +#include +#include +#include #define LINE_SIZE 80 #include + #include "mtrr.h" #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = { - "uncachable", /* 0 */ - "write-combining", /* 1 */ - "?", /* 2 */ - "?", /* 3 */ - "write-through", /* 4 */ - "write-protect", /* 5 */ - "write-back", /* 6 */ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ }; const char *mtrr_attrib_to_str(int x) @@ -35,8 +36,8 @@ static int mtrr_file_add(unsigned long base, unsigned long size, unsigned int type, bool increment, struct file *file, int page) { + unsigned int *fcount = FILE_FCOUNT(file); int reg, max; - unsigned int *fcount = FILE_FCOUNT(file); max = num_var_ranges; if (fcount == NULL) { @@ -61,8 +62,8 @@ static int mtrr_file_del(unsigned long base, unsigned long size, struct file *file, int page) { - int reg; unsigned int *fcount = FILE_FCOUNT(file); + int reg; if (!page) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) @@ -81,13 +82,14 @@ mtrr_file_del(unsigned long base, unsigned long size, return reg; } -/* RED-PEN: seq_file can seek now. this is ignored. */ +/* + * seq_file can seek but we ignore it. 
+ * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ static ssize_t mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) -/* Format of control line: - "base=%Lx size=%Lx type=%s" OR: - "disable=%d" -*/ { int i, err; unsigned long reg; @@ -100,15 +102,18 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EPERM; if (!len) return -EINVAL; + memset(line, 0, LINE_SIZE); if (len > LINE_SIZE) len = LINE_SIZE; if (copy_from_user(line, buf, len - 1)) return -EFAULT; + linelen = strlen(line); ptr = line + linelen - 1; if (linelen && *ptr == '\n') *ptr = '\0'; + if (!strncmp(line, "disable=", 8)) { reg = simple_strtoul(line + 8, &ptr, 0); err = mtrr_del_page(reg, 0, 0); @@ -116,28 +121,35 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return err; return len; } + if (strncmp(line, "base=", 5)) return -EINVAL; + base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "size=", 5)) return -EINVAL; + size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) ; + for (; isspace(*ptr); ++ptr) + ; + for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) continue; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; - err = - mtrr_add_page((unsigned long) base, (unsigned long) size, i, - true); + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); if (err < 0) return err; return len; @@ -181,7 +193,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) case MTRRIOC32_SET_PAGE_ENTRY: case MTRRIOC32_DEL_PAGE_ENTRY: case MTRRIOC32_KILL_PAGE_ENTRY: { - struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)__arg; + struct 
mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; err = get_user(sentry.base, &s32->base); err |= get_user(sentry.size, &s32->size); err |= get_user(sentry.type, &s32->type); @@ -191,7 +205,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) } case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = get_user(gentry.regnum, &g32->regnum); err |= get_user(gentry.base, &g32->base); err |= get_user(gentry.size, &g32->size); @@ -314,7 +330,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (err) return err; - switch(cmd) { + switch (cmd) { case MTRRIOC_GET_ENTRY: case MTRRIOC_GET_PAGE_ENTRY: if (copy_to_user(arg, &gentry, sizeof gentry)) @@ -323,7 +339,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #ifdef CONFIG_COMPAT case MTRRIOC32_GET_ENTRY: case MTRRIOC32_GET_PAGE_ENTRY: { - struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)__arg; + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; err = put_user(gentry.base, &g32->base); err |= put_user(gentry.size, &g32->size); err |= put_user(gentry.regnum, &g32->regnum); @@ -335,11 +353,10 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) return err; } -static int -mtrr_close(struct inode *ino, struct file *file) +static int mtrr_close(struct inode *ino, struct file *file) { - int i, max; unsigned int *fcount = FILE_FCOUNT(file); + int i, max; if (fcount != NULL) { max = num_var_ranges; @@ -359,22 +376,22 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset); static int mtrr_open(struct inode *inode, struct file *file) { - if (!mtrr_if) + if (!mtrr_if) return -EIO; - if (!mtrr_if->get) - return -ENXIO; + if (!mtrr_if->get) + return -ENXIO; return single_open(file, mtrr_seq_show, NULL); } 
static const struct file_operations mtrr_fops = { - .owner = THIS_MODULE, - .open = mtrr_open, - .read = seq_read, - .llseek = seq_lseek, - .write = mtrr_write, - .unlocked_ioctl = mtrr_ioctl, - .compat_ioctl = mtrr_ioctl, - .release = mtrr_close, + .owner = THIS_MODULE, + .open = mtrr_open, + .read = seq_read, + .llseek = seq_lseek, + .write = mtrr_write, + .unlocked_ioctl = mtrr_ioctl, + .compat_ioctl = mtrr_ioctl, + .release = mtrr_close, }; static int mtrr_seq_show(struct seq_file *seq, void *offset) @@ -388,23 +405,24 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) max = num_var_ranges; for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); - if (size == 0) + if (size == 0) { mtrr_usage_table[i] = 0; - else { - if (size < (0x100000 >> PAGE_SHIFT)) { - /* less than 1MB */ - factor = 'K'; - size <<= PAGE_SHIFT - 10; - } else { - factor = 'M'; - size >>= 20 - PAGE_SHIFT; - } - /* RED-PEN: base can be > 32bit */ - len += seq_printf(seq, - "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_usage_table[i], mtrr_attrib_to_str(type)); + continue; } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit */ + len += seq_printf(seq, "reg%02i: base=0x%06lx000 " + "(%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), size, + factor, mtrr_usage_table[i], + mtrr_attrib_to_str(type)); } return 0; } @@ -422,6 +440,5 @@ static int __init mtrr_if_init(void) proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); return 0; } - arch_initcall(mtrr_if_init); #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3-70-g09d2 From 3ec8dbcb09bb6df83993ca03e88cb85e3aaa8edb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:54:16 +0530 Subject: x86: Clean up mtrr/mtrr.h Fix: ERROR: do not use C99 // comments 
ERROR: "foo * bar" should be "foo *bar" X 2 Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ More tidyups ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/mtrr.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7538b767f20..a501dee9a87 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -1,5 +1,5 @@ /* - * local mtrr defines. + * local MTRR defines. */ #include @@ -14,13 +14,12 @@ extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; u32 use_intel_if; -// void (*init)(void); void (*set)(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); void (*set_all)(void); void (*get)(unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type); + unsigned long *size, mtrr_type *type); int (*get_free_region)(unsigned long base, unsigned long size, int replace_reg); int (*validate_add_page)(unsigned long base, unsigned long size, @@ -39,11 +38,11 @@ extern int positive_have_wrcomb(void); /* library functions for processor-specific routines */ struct set_mtrr_context { - unsigned long flags; - unsigned long cr4val; - u32 deftype_lo; - u32 deftype_hi; - u32 ccr3; + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; }; void set_mtrr_done(struct set_mtrr_context *ctxt); @@ -54,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops * ops); +extern void set_mtrr_ops(struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops * mtrr_if; +extern struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define 
use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) -- cgit v1.2.3-70-g09d2 From 09b22c85d59dd935fdfa71655a443785e3f99c18 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:54:53 +0530 Subject: x86: Clean up mtrr/state.c Fix: WARNING: Use #include instead of WARNING: line over 80 characters X 4 arch/x86/kernel/cpu/mtrr/state.o: text data bss dec hex filename 864 0 0 864 360 state.o.before 864 0 0 864 360 state.o.after md5: c5c4364b9aeac74d70111e1e49667a2c state.o.before.asm c5c4364b9aeac74d70111e1e49667a2c state.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> [ More cleanups ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/state.c | 68 +++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 1f5fb1588d1..dfc80b4e6b0 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -1,24 +1,25 @@ -#include #include -#include -#include -#include +#include +#include + #include #include -#include "mtrr.h" +#include +#include +#include "mtrr.h" -/* Put the processor into a state where MTRRs can be safely set */ +/* Put the processor into a state where MTRRs can be safely set */ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) { unsigned int cr0; - /* Disable interrupts locally */ + /* Disable interrupts locally */ local_irq_save(ctxt->flags); if (use_intel() || is_cpu(CYRIX)) { - /* Save value of CR4 and clear Page Global Enable (bit 7) */ + /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { ctxt->cr4val = read_cr4(); write_cr4(ctxt->cr4val & ~X86_CR4_PGE); @@ -33,50 +34,61 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) write_cr0(cr0); wbinvd(); - if (use_intel()) - /* Save MTRR state */ + if (use_intel()) { + /* 
Save MTRR state */ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else were excluded at the top */ + } else { + /* + * Cyrix ARRs - + * everything else were excluded at the top + */ ctxt->ccr3 = getCx86(CX86_CCR3); + } } } void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { - if (use_intel()) - /* Disable MTRRs, and set the default type to uncached */ + if (use_intel()) { + /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); - else if (is_cpu(CYRIX)) - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } else { + if (is_cpu(CYRIX)) { + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); + } + } } -/* Restore the processor after a set_mtrr_prepare */ +/* Restore the processor after a set_mtrr_prepare */ void set_mtrr_done(struct set_mtrr_context *ctxt) { if (use_intel() || is_cpu(CYRIX)) { - /* Flush caches and TLBs */ + /* Flush caches and TLBs */ wbinvd(); - /* Restore MTRRdefType */ - if (use_intel()) + /* Restore MTRRdefType */ + if (use_intel()) { /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - else - /* Cyrix ARRs - everything else was excluded at the top */ + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, + ctxt->deftype_hi); + } else { + /* + * Cyrix ARRs - + * everything else was excluded at the top + */ setCx86(CX86_CCR3, ctxt->ccr3); + } - /* Enable caches */ + /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); - /* Restore value of CR4 */ + /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(ctxt->cr4val); } - /* Re-enable interrupts locally (if enabled previously) */ + /* Re-enable interrupts locally (if enabled previously) */ local_irq_restore(ctxt->flags); } - -- cgit v1.2.3-70-g09d2 From dbd51be026eaf84088fdee7fab9f38fa92eef26d Mon Sep 17 
00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 4 Jul 2009 07:56:28 +0530 Subject: x86: Clean up mtrr/main.c Fix following trivial style problems: ERROR: trailing whitespace X 25 WARNING: Use #include <linux/uaccess.h> instead of <asm/uaccess.h> WARNING: Use #include <linux/kvm_para.h> instead of <asm/kvm_para.h> ERROR: do not initialise externals to 0 or NULL X 2 ERROR: "foo * bar" should be "foo *bar" X 5 ERROR: do not use assignment in if condition X 2 WARNING: line over 80 characters X 8 ERROR: return is not a function, parentheses are not required WARNING: braces {} are not necessary for any arm of this statement ERROR: space required before the open parenthesis '(' X 2 ERROR: open brace '{' following function declarations go on the next line ERROR: space required after that ',' (ctx:VxV) X 8 ERROR: space required before the open parenthesis '(' X 3 ERROR: else should follow close brace '}' WARNING: space prohibited between function name and open parenthesis '(' WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable X 2 Also use pr_debug and pr_warning where possible. 
total: 50 errors, 14 warnings arch/x86/kernel/cpu/mtrr/main.o: text data bss dec hex filename 3668 116 4156 7940 1f04 main.o.before 3668 116 4156 7940 1f04 main.o.after md5: e01af2fd28deef77c8d01e71acfbd365 main.o.before.asm e01af2fd28deef77c8d01e71acfbd365 main.o.after.asm Suggested-by: Alan Cox Signed-off-by: Jaswinder Singh Rajput Cc: Andrew Morton Cc: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> Cc: Avi Kivity # Avi, please have a look at the kvm_para.h bit [ More cleanups ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 455 +++++++++++++++++++++------------------- 1 file changed, 242 insertions(+), 213 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8fc248b5aea..7af0f88a416 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -25,43 +25,48 @@ Operating System Writer's Guide" (Intel document number 242692), section 11.11.7 - This was cleaned and made readable by Patrick Mochel - on 6-7 March 2002. - Source: Intel Architecture Software Developers Manual, Volume 3: + This was cleaned and made readable by Patrick Mochel + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: System Programming Guide; Section 9.11. (1997 edition - PPro). 
*/ +#define DEBUG + +#include /* FIXME: kvm_para.h needs this */ + +#include +#include #include +#include #include +#include +#include #include #include -#include -#include -#include +#include #include #include -#include -#include #include -#include + #include "mtrr.h" -u32 num_var_ranges = 0; +u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; +static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops * mtrr_if = NULL; +struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops * ops) +void set_mtrr_ops(struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; @@ -72,30 +77,36 @@ static int have_wrcomb(void) { struct pci_dev *dev; u8 rev; - - if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { - /* ServerWorks LE chipsets < rev 6 have problems with write-combining - Don't allow it and leave room for other chipsets to be tagged */ + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged + */ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev <= 5) { - printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } } - /* Intel 450NX errata # 23. Non ascending cacheline evictions to - write combining memory may resulting in data corruption */ + /* + * Intel 450NX errata # 23. 
Non ascending cacheline evictions to + * write combining memory may resulting in data corruption + */ if (dev->vendor == PCI_VENDOR_ID_INTEL && dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); + pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); pci_dev_put(dev); return 0; } pci_dev_put(dev); - } - return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); + } + return mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0; } /* This function returns the number of variable MTRRs */ @@ -103,12 +114,13 @@ static void __init set_num_var_ranges(void) { unsigned long config = 0, dummy; - if (use_intel()) { + if (use_intel()) rdmsr(MSR_MTRRcap, config, dummy); - } else if (is_cpu(AMD)) + else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) config = 8; + num_var_ranges = config & 0xff; } @@ -130,10 +142,12 @@ struct set_mtrr_data { mtrr_type smp_type; }; +/** + * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * + * Returns nothing. + */ static void ipi_handler(void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. 
-*/ { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; @@ -142,18 +156,19 @@ static void ipi_handler(void *info) local_irq_save(flags); atomic_dec(&data->count); - while(!atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ - if (data->smp_reg != ~0U) - mtrr_if->set(data->smp_reg, data->smp_base, + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - else + } else { mtrr_if->set_all(); + } atomic_dec(&data->count); - while(atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); @@ -161,7 +176,8 @@ static void ipi_handler(void *info) #endif } -static inline int types_compatible(mtrr_type type1, mtrr_type type2) { +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ return type1 == MTRR_TYPE_UNCACHABLE || type2 == MTRR_TYPE_UNCACHABLE || (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || @@ -176,10 +192,10 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * @type: mtrr type * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: - * + * * 1. Send IPI to do the following: * 2. Disable Interrupts - * 3. Wait for all procs to do so + * 3. Wait for all procs to do so * 4. Enter no-fill cache mode * 5. Flush caches * 6. Clear PGE bit @@ -189,26 +205,27 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) { * 10. Enable all range registers * 11. Flush all TLBs and caches again * 12. Enter normal cache mode and reenable caching - * 13. Set PGE + * 13. Set PGE * 14. Wait for buddies to catch up * 15. Enable interrupts. - * + * * What does that mean for us? Well, first we set data.count to the number * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. 
Once it's set, each - * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it - * differently, so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate to - * be reset. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. + * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * CPU goes through the transition of updating MTRRs. + * The CPU vendors may each do it differently, + * so we call mtrr_if->set() callback and let them take care of it. + * When they're done, they again decrement data->count and wait for data.gate + * to be reset. + * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { struct set_mtrr_data data; unsigned long flags; @@ -218,121 +235,122 @@ static void set_mtrr(unsigned int reg, unsigned long base, data.smp_size = size; data.smp_type = type; atomic_set(&data.count, num_booting_cpus() - 1); - /* make sure data.count is visible before unleashing other CPUs */ + + /* Make sure data.count is visible before unleashing other CPUs */ smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); - /* Start the ball rolling on other CPUs */ + /* Start the ball rolling on other CPUs */ if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); - /* ok, reset count and toggle gate */ + /* Ok, reset count and toggle gate */ atomic_set(&data.count, num_booting_cpus() 
- 1); smp_wmb(); - atomic_set(&data.gate,1); + atomic_set(&data.gate, 1); - /* do our MTRR business */ + /* Do our MTRR business */ - /* HACK! + /* + * HACK! * We use this same function to initialize the mtrrs on boot. * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. + * to replicate across all the APs. * If we're doing that @reg is set to something special... */ - if (reg != ~0U) - mtrr_if->set(reg,base,size,type); + if (reg != ~0U) + mtrr_if->set(reg, base, size, type); - /* wait for the others */ - while(atomic_read(&data.count)) + /* Wait for the others */ + while (atomic_read(&data.count)) cpu_relax(); atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate,0); + atomic_set(&data.gate, 0); /* * Wait here for everyone to have seen the gate change * So we're the last ones to touch 'data' */ - while(atomic_read(&data.count)) + while (atomic_read(&data.count)) cpu_relax(); local_irq_restore(flags); } /** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (in units of 4 kB!) - * @size: Physical size of region in pages (4 kB) - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. 
+ * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. 
*/ - -int mtrr_add_page(unsigned long base, unsigned long size, +int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, bool increment) { + unsigned long lbase, lsize; int i, replace, error; mtrr_type ltype; - unsigned long lbase, lsize; if (!mtrr_if) return -ENXIO; - - if ((error = mtrr_if->validate_add_page(base,size,type))) + + error = mtrr_if->validate_add_page(base, size, type); + if (error) return error; if (type >= MTRR_NUM_TYPES) { - printk(KERN_WARNING "mtrr: type: %u invalid\n", type); + pr_warning("mtrr: type: %u invalid\n", type); return -EINVAL; } - /* If the type is WC, check that this processor supports it */ + /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - printk(KERN_WARNING - "mtrr: your processor doesn't support write-combining\n"); + pr_warning("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - printk(KERN_WARNING "mtrr: zero sized request\n"); + pr_warning("mtrr: zero sized request\n"); return -EINVAL; } if (base & size_or_mask || size & size_or_mask) { - printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); + pr_warning("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -341,36 +359,40 @@ int mtrr_add_page(unsigned long base, unsigned long size, /* No CPU hotplug when we change MTRR entries */ get_online_cpus(); - /* Search for existing MTRR */ + + /* Search for existing MTRR */ mutex_lock(&mtrr_mutex); for (i = 0; i < num_var_ranges; ++i) { mtrr_if->get(i, &lbase, &lsize, <ype); - if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase) + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) continue; - /* At this point we know there is some kind of overlap/enclosure */ + /* + * At this point we know there is some kind of + * overlap/enclosure + */ if (base < lbase || base + size - 1 > lbase + lsize - 1) { - if (base <= lbase && base + size - 1 >= lbase + 
lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { /* New region encloses an existing region */ if (type == ltype) { replace = replace == -1 ? i : -2; continue; - } - else if (types_compatible(type, ltype)) + } else if (types_compatible(type, ltype)) continue; } - printk(KERN_WARNING - "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, - lsize); + pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); goto out; } - /* New region is enclosed by an existing region */ + /* New region is enclosed by an existing region */ if (ltype != type) { if (types_compatible(type, ltype)) continue; - printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, mtrr_attrib_to_str(ltype), - mtrr_attrib_to_str(type)); + pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); goto out; } if (increment) @@ -378,7 +400,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; goto out; } - /* Search for an empty MTRR */ + /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); @@ -393,8 +415,9 @@ int mtrr_add_page(unsigned long base, unsigned long size, mtrr_usage_table[replace] = 0; } } - } else - printk(KERN_INFO "mtrr: no more MTRRs available\n"); + } else { + pr_info("mtrr: no more MTRRs available\n"); + } error = i; out: mutex_unlock(&mtrr_mutex); @@ -405,10 +428,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING - "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG - "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + 
pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; } @@ -416,66 +437,64 @@ static int mtrr_check(unsigned long base, unsigned long size) } /** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. 
+ * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. * - * The available types are + * The available types are * - * %MTRR_TYPE_UNCACHABLE - No caching + * %MTRR_TYPE_UNCACHABLE - No caching * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. */ - -int -mtrr_add(unsigned long base, unsigned long size, unsigned int type, - bool increment) +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } +EXPORT_SYMBOL(mtrr_add); /** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. 
If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. */ - int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; @@ -500,22 +519,22 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg < 0) { - printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); goto out; } } if (reg >= max) { - printk(KERN_WARNING "mtrr: register: %d too big\n", reg); + pr_warning("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); + pr_warning("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + pr_warning("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) @@ -526,33 +545,31 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) put_online_cpus(); return error; } + /** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. + * If register is supplied then base and size are ignored. This is + * how drivers should call it. * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. 
+ * On success the register is returned, on failure a negative error + * code. */ - -int -mtrr_del(int reg, unsigned long base, unsigned long size) +int mtrr_del(int reg, unsigned long base, unsigned long size) { if (mtrr_check(base, size)) return -EINVAL; return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } - -EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); -/* HACK ALERT! +/* + * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall * stuff is done... */ @@ -576,29 +593,28 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device * sysdev, pm_message_t state) +static int mtrr_save(struct sys_device *sysdev, pm_message_t state) { int i; for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, - &mtrr_value[i].lbase, - &mtrr_value[i].lsize, - &mtrr_value[i].ltype); + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); } return 0; } -static int mtrr_restore(struct sys_device * sysdev) +static int mtrr_restore(struct sys_device *sysdev) { int i; for (i = 0; i < num_var_ranges; i++) { - if (mtrr_value[i].lsize) - set_mtrr(i, - mtrr_value[i].lbase, - mtrr_value[i].lsize, - mtrr_value[i].ltype); + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } } return 0; } @@ -615,26 +631,29 @@ int __initdata changed_by_mtrr_cleanup; /** * mtrr_bp_init - initialize mtrrs on the boot CPU * - * This needs to be called early; before any of the other CPUs are + * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). 
- * + * */ void __init mtrr_bp_init(void) { u32 phys_addr; + init_ifs(); phys_addr = 32; if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ + size_or_mask = 0xff000000; /* 36 bits */ size_and_mask = 0x00f00000; phys_addr = 36; - /* This is an AMD specific MSR, but we assume(hope?) that - Intel will implement it to when they extend the address - bus of the Xeon. */ + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it to when they extend the address + * bus of the Xeon. + */ if (cpuid_eax(0x80000000) >= 0x80000008) { phys_addr = cpuid_eax(0x80000008) & 0xff; /* CPUID workaround for Intel 0F33/0F34 CPU */ @@ -649,9 +668,11 @@ void __init mtrr_bp_init(void) size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { - /* VIA C* family have Intel style MTRRs, but - don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = 0xfff00000; /* 32 bits */ size_and_mask = 0; phys_addr = 32; } @@ -694,7 +715,6 @@ void __init mtrr_bp_init(void) changed_by_mtrr_cleanup = 1; mtrr_if->set_all(); } - } } } @@ -706,12 +726,17 @@ void mtrr_ap_init(void) if (!mtrr_if || !use_intel()) return; /* - * Ideally we should hold mtrr_mutex here to avoid mtrr entries changed, - * but this routine will be called in cpu boot time, holding the lock - * breaks it. This routine is called in two cases: 1.very earily time - * of software resume, when there absolutely isn't mtrr entry changes; - * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to - * prevent mtrr entry changes + * Ideally we should hold mtrr_mutex here to avoid mtrr entries + * changed, but this routine will be called in cpu boot time, + * holding the lock breaks it. + * + * This routine is called in two cases: + * + * 1. 
very earily time of software resume, when there absolutely + * isn't mtrr entry changes; + * + * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug + * lock to prevent mtrr entry changes */ local_irq_save(flags); @@ -732,19 +757,23 @@ static int __init mtrr_init_finialize(void) { if (!mtrr_if) return 0; + if (use_intel()) { if (!changed_by_mtrr_cleanup) mtrr_state_warn(); - } else { - /* The CPUs haven't MTRR and seem to not support SMP. They have - * specific drivers, we use a tricky method to support - * suspend/resume for them. - * TBD: is there any system with such CPU which supports - * suspend/resume? if no, we should remove the code. - */ - sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); + return 0; } + + /* + * The CPU has no MTRR and seems to not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code. + */ + sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + return 0; } subsys_initcall(mtrr_init_finialize); -- cgit v1.2.3-70-g09d2 From e3d0e69268dffb9676bf0800a60fb3573a723480 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Jul 2009 09:44:11 +0200 Subject: x86: Further clean up of mtrr/generic.c Yinghai noticed that i defined BIOS_BUG_MSG but added no usage for it. The usage is to clean up this turd in generic.c: printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " "contains strange UC entry under 1M, check " "with your system vendor!\n", i); Breaking printk lines in the middle looks ugly, is hard to read and breaks 'git grep'. Use the BIOS_BUG_MSG instead. Also complete the moving of structure definitions and variables to the top of the file. 
Reported-by: Yinghai Lu LKML-Reference: <20090703164225.GA21447@elte.hu> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index b8aba811b60..315738c74aa 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -34,14 +34,37 @@ #include "mtrr.h" -/* Should be related to MTRR_VAR_RANGES nums */ -#define RANGE_NUM 256 - struct res_range { unsigned long start; unsigned long end; }; +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; +}; + +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct res_range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) + + static int __init add_range(struct res_range *range, int nr_range, unsigned long start, unsigned long end) @@ -147,18 +170,6 @@ static int __init cmp_range(const void *x1, const void *x2) return start1 - start2; } -struct var_mtrr_range_state { - unsigned long base_pfn; - unsigned long size_pfn; - mtrr_type type; -}; - -static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; - -static int __initdata debug_print; -#define Dprintk(x...) 
do { if (debug_print) printk(KERN_DEBUG x); } while (0) - - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" @@ -200,9 +211,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && (mtrr_state.enabled & 1)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(KERN_WARNING "WARNING: BIOS bug: VAR MTRR %d " - "contains strange UC entry under 1M, check " - "with your system vendor!\n", i); + printk(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -244,9 +253,6 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, return nr_range; } -static struct res_range __initdata range[RANGE_NUM]; -static int __initdata nr_range; - #ifdef CONFIG_MTRR_SANITIZER static unsigned long __init sum_ranges(struct res_range *range, int nr_range) @@ -284,14 +290,6 @@ static int __init mtrr_cleanup_debug_setup(char *str) } early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); -struct var_mtrr_state { - unsigned long range_startk; - unsigned long range_sizek; - unsigned long chunk_sizek; - unsigned long gran_sizek; - unsigned int reg; -}; - static void __init set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, unsigned char type, unsigned int address_bits) -- cgit v1.2.3-70-g09d2 From 023bf6f1b8bf58dc4da7f0dc1cf4787b0d5297c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2009 11:27:40 +0900 Subject: linker script: unify usage of discard definition Discarded sections in different archs share some commonality but have considerable differences. This led to linker script for each arch implementing its own /DISCARD/ definition, which makes maintaining tedious and adding new entries error-prone. This patch makes all linker scripts to move discard definitions to the end of the linker script and use the common DISCARDS macro. 
As ld uses the first matching section definition, archs can include default discarded sections by including them earlier in the linker script. ia64 is notable because it first throws away some ia64 specific subsections and then include the rest of the sections into the final image, so those sections must be discarded before the inclusion. defconfig compile tested for x86, x86-64, powerpc, powerpc64, ia64, alpha, sparc, sparc64 and s390. Michal Simek tested microblaze. Signed-off-by: Tejun Heo Acked-by: Paul Mundt Acked-by: Mike Frysinger Tested-by: Michal Simek Cc: linux-arch@vger.kernel.org Cc: Michal Simek Cc: microblaze-uclinux@itee.uq.edu.au Cc: Sam Ravnborg Cc: Tony Luck --- arch/alpha/kernel/vmlinux.lds.S | 10 ++-------- arch/avr32/kernel/vmlinux.lds.S | 10 +++------- arch/blackfin/kernel/vmlinux.lds.S | 6 +----- arch/cris/kernel/vmlinux.lds.S | 10 ++-------- arch/frv/kernel/vmlinux.lds.S | 2 +- arch/h8300/kernel/vmlinux.lds.S | 6 ++---- arch/ia64/kernel/vmlinux.lds.S | 17 ++++++++--------- arch/m32r/kernel/vmlinux.lds.S | 11 +++-------- arch/m68k/kernel/vmlinux-std.lds | 11 +++-------- arch/m68k/kernel/vmlinux-sun3.lds | 10 ++-------- arch/m68knommu/kernel/vmlinux.lds.S | 8 +------- arch/microblaze/kernel/vmlinux.lds.S | 2 +- arch/mips/kernel/vmlinux.lds.S | 22 ++++++++++------------ arch/mn10300/kernel/vmlinux.lds.S | 9 +++------ arch/parisc/kernel/vmlinux.lds.S | 9 ++++----- arch/powerpc/kernel/vmlinux.lds.S | 10 +++------- arch/s390/kernel/vmlinux.lds.S | 10 +++------- arch/sh/kernel/vmlinux.lds.S | 11 ++++------- arch/sparc/kernel/vmlinux.lds.S | 9 ++------- arch/um/include/asm/common.lds.S | 5 ----- arch/um/kernel/dyn.lds.S | 2 +- arch/um/kernel/uml.lds.S | 2 +- arch/x86/kernel/vmlinux.lds.S | 11 ++++------- arch/xtensa/kernel/vmlinux.lds.S | 14 ++++---------- include/asm-generic/vmlinux.lds.h | 18 ++++++++++++------ 25 files changed, 80 insertions(+), 155 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/vmlinux.lds.S 
b/arch/alpha/kernel/vmlinux.lds.S index 75fe1d6877e..6dc03c35caa 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -134,14 +134,6 @@ SECTIONS __bss_stop = .; _end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .mdebug 0 : { *(.mdebug) } @@ -151,4 +143,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index b8324608ec0..c4b56654349 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -124,15 +124,11 @@ SECTIONS _end = .; } + DWARF_DEBUG + /* When something in the kernel is NOT compiled as a module, the module * cleanup code and data are put into these segments. Both can then be * thrown away, as cleanup code is never called unless it's a module. */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - - DWARF_DEBUG + DISCARDS } diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 6e8eabd8f0a..d7ffe299b97 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -277,9 +277,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : - { - *(.exitcall.exit) - *(.discard) - } + DISCARDS } diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index a3175ebb38c..6c81836b922 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -140,13 +140,7 @@ SECTIONS _end = .; __end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; + + DISCARDS } diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 64b5a5e4d35..7dbf41f68b5 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -178,7 +178,7 @@ SECTIONS .comment 0 : { *(.comment) } - /DISCARD/ : 
{ *(.discard) } + DISCARDS } __kernel_image_size_no_bss = __bss_start - __kernel_image_start; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 03d6c0df33d..662b02ecb86 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -152,10 +152,6 @@ SECTIONS __end = . ; __ramstart = .; } - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } .romfs : { *(.romfs*) @@ -166,4 +162,6 @@ SECTIONS COMMAND_START = . - 0x200 ; __ramend = . ; } + + DISCARDS } diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 13d95897587..eb4214d1c5a 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -24,15 +24,14 @@ PHDRS { } SECTIONS { - /* Sections to be discarded */ + /* unwind exit sections must be discarded before the rest of the + sections get included. */ /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) *(.IA_64.unwind.exit.text) *(.IA_64.unwind_info.exit.text) - } + *(.comment) + *(.note) + } v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ phys_start = _start - LOAD_OFFSET; @@ -317,7 +316,7 @@ SECTIONS .debug_funcnames 0 : { *(.debug_funcnames) } .debug_typenames 0 : { *(.debug_typenames) } .debug_varnames 0 : { *(.debug_varnames) } - /* These must appear regardless of . */ - /DISCARD/ : { *(.comment) } - /DISCARD/ : { *(.note) } + + /* Default discards */ + DISCARDS } diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 480a49944cf..de5e21cca6a 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -120,14 +120,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. 
*/ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -136,4 +128,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 905a797ada9..47eac19e8f6 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -82,14 +82,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -98,4 +90,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 47d04be322a..03efaf04d7d 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -77,14 +77,6 @@ __init_begin = .; _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .crap : { /* Stabs debugging sections. */ *(.stab) @@ -97,4 +89,6 @@ __init_begin = .; *(.note) } + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 68111a61a77..2736a5e309c 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -184,13 +184,6 @@ SECTIONS { __init_end = .; } > INIT - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .bss : { . = ALIGN(4); _sbss = . ; @@ -201,5 +194,6 @@ SECTIONS { _end = . 
; } > BSS + DISCARDS } diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index 81bebdcb18f..ec5fa91a48d 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -163,5 +163,5 @@ SECTIONS { . = ALIGN(4096); _end = .; - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 45901609b74..1474c18fb77 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -176,18 +176,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - - /* ABI crap starts here */ - *(.MIPS.options) - *(.options) - *(.pdr) - *(.reginfo) - } - /* These mark the ABI of the kernel for debuggers. */ .mdebug.abi32 : { KEEP(*(.mdebug.abi32)) @@ -213,4 +201,14 @@ SECTIONS *(.gptab.bss) *(.gptab.sbss) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { + /* ABI crap starts here */ + *(.MIPS.options) + *(.options) + *(.pdr) + *(.reginfo) + } } diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 5609d4962a5..8fcd0f1e21d 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -115,13 +115,10 @@ SECTIONS . = ALIGN(PAGE_SIZE); pg0 = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_CALL - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index ccf58341845..aea1784edbd 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -237,10 +237,12 @@ SECTIONS /* freed after init ends here */ _end = . 
; + STABS_DEBUG + .note 0 : { *(.note) } + /* Sections to be discarded */ + DISCARDS /DISCARD/ : { - *(.exitcall.exit) - *(.discard) #ifdef CONFIG_64BIT /* temporary hack until binutils is fixed to not emit these * for static binaries @@ -253,7 +255,4 @@ SECTIONS *(.gnu.hash) #endif } - - STABS_DEBUG - .note 0 : { *(.note) } } diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7fca9355fd3..244e3658983 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -37,13 +37,6 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { - /* Sections to be discarded. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - EXIT_DATA - } - . = KERNELBASE; /* @@ -299,4 +292,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); _end = . ; PROVIDE32 (end = .); + + /* Sections to be discarded. */ + DISCARDS } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 98867dfea46..82415c75b99 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -157,14 +157,10 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Debugging sections. */ STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 766976d27b2..0ce254bca92 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -163,17 +163,14 @@ SECTIONS _end = . ; } + STABS_DEBUG + DWARF_DEBUG + /* * When something in the kernel is NOT compiled as a module, the * module cleanup code and data are put into these segments. Both * can then be thrown away, as cleanup code is never called unless * it's a module. 
*/ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } - - STABS_DEBUG - DWARF_DEBUG + DISCARDS } diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index d63cf914667..866390feb68 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -171,13 +171,8 @@ SECTIONS } _end = . ; - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index cb0248616d4..37ecc5577a9 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -123,8 +123,3 @@ __initramfs_end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - } - diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 2916d6eadff..715a188c047 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -157,5 +157,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 1f8a622cabe..2ebd39765db 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -101,5 +101,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e8788204..b600c843710 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -387,15 +387,12 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index b1e24638acd..921b6ff3b64 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -280,16 +280,6 @@ SECTIONS *(.ResetVector.text) } - /* Sections to be discarded */ - /DISCARD/ : 
- { - *(.exit.literal) - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .xt.lit : { *(.xt.lit) } .xt.prop : { *(.xt.prop) } @@ -322,4 +312,8 @@ SECTIONS *(.xt.lit) *(.gnu.linkonce.p*) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.exit.literal) } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c5c18ac878a..ab8ea9b7741 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -35,13 +35,10 @@ * __bss_stop = .; * _end = .; * - * /DISCARD/ : { - * EXIT_TEXT - * EXIT_DATA - * EXIT_CALL - * } * STABS_DEBUG * DWARF_DEBUG + * + * DISCARDS // must be the last * } * * [__init_begin, __init_end] is the init section that may be freed after init @@ -629,11 +626,20 @@ #define INIT_RAM_FS #endif +/* + * Default discarded sections. + * + * Some archs want to discard exit text/data at runtime rather than + * link time due to cross-section references such as alt instructions, + * bug table, eh_frame, etc. DISCARDS must be the last of output + * section definitions so that such archs put those in earlier section + * definitions. + */ #define DISCARDS \ /DISCARD/ : { \ EXIT_TEXT \ EXIT_DATA \ - *(.exitcall.exit) \ + EXIT_CALL \ *(.discard) \ } -- cgit v1.2.3-70-g09d2 From c31d96338a6041520ba5f1b6a4a5012ef00686b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:37 +0200 Subject: x86: mce: Make CONFIG_X86_ANCIENT_MCE dependent on CONFIG_X86_MCE Add a missing dependency for ANCIENT_MCE. It didn't matter in practice because the ANCIENT code wasn't compiled without X86_MCE, but it's better to express that clearly in Kconfig. Signed-off-by: Andi Kleen Signed-off-by: H.
Peter Anvin --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 356d2ec8e2f..5962b872a7a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -823,7 +823,7 @@ config X86_MCE_AMD config X86_ANCIENT_MCE def_bool n - depends on X86_32 + depends on X86_32 && X86_MCE prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip -- cgit v1.2.3-70-g09d2 From bab9bc6583fe6c1660d6ed36dd14bbb4edfaf393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:38 +0200 Subject: x86: mce: Update X86_MCE description in x86/Kconfig - Clarify that this config controls thermal throttling reporting too - Clarify the types of errors reported by machine checks - Drop references to ancient CPUs. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5962b872a7a..134a8c0d80d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -774,20 +774,12 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS increased on these systems. config X86_MCE - bool "Machine Check Exception" + bool "Machine Check / overheating reporting" ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). + Machine Check support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, data corruption). The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. 
Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. + ranging from warning messages to halting the machine. config X86_OLD_MCE depends on X86_32 && X86_MCE -- cgit v1.2.3-70-g09d2 From 5bb38adcb54cf7192b154368ad62982caa11ca0b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:39 +0200 Subject: x86: mce: Remove old i386 machine check code As announced in feature-removal-schedule.txt, remove CONFIG_X86_OLD_MCE. This patch only removes code. The ancient machine check code for very old systems that are not supported by CONFIG_X86_NEW_MCE is still kept. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- Documentation/feature-removal-schedule.txt | 10 -- arch/x86/Kconfig | 35 +------ arch/x86/include/asm/mce.h | 11 -- arch/x86/kernel/cpu/mcheck/Makefile | 2 - arch/x86/kernel/cpu/mcheck/k7.c | 116 -------------------- arch/x86/kernel/cpu/mcheck/mce.c | 47 --------- arch/x86/kernel/cpu/mcheck/non-fatal.c | 94 ----------------- arch/x86/kernel/cpu/mcheck/p4.c | 163 ----------------------------- arch/x86/kernel/cpu/mcheck/p6.c | 127 ---------------------- 9 files changed, 2 insertions(+), 603 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/k7.c delete mode 100644 arch/x86/kernel/cpu/mcheck/non-fatal.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p4.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p6.c (limited to 'arch/x86') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7129846a278..edb2f0b0761 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@
-444,13 +444,3 @@ What: CONFIG_RFKILL_INPUT When: 2.6.33 Why: Should be implemented in userspace, policy daemon. Who: Johannes Berg - ----------------------------- - -What: CONFIG_X86_OLD_MCE -When: 2.6.32 -Why: Remove the old legacy 32bit machine check code. This has been - superseded by the newer machine check code from the 64bit port, - but the old version has been kept around for easier testing. Note this - doesn't impact the old P5 and WinChip machine check handlers. -Who: Andi Kleen diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 134a8c0d80d..d986769a7d9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,21 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_OLD_MCE - depends on X86_32 && X86_MCE - bool "Use legacy machine check code (will go away)" - default n - select X86_ANCIENT_MCE - ---help--- - Use the old i386 machine check code. This is merely intended for - testing in a transition period. Try this if you run into any machine - check related software problems, but report the problem to - linux-kernel. When in doubt say no. - config X86_NEW_MCE depends on X86_MCE bool - default y if (!X86_OLD_MCE && X86_32) || X86_64 + default y config X86_MCE_INTEL def_bool y @@ -835,29 +824,9 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_OLD_MCE - ---help--- - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying - or out-of-spec (ie, overclocked) hardware. 
- This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) - ---help--- - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. - config X86_THERMAL_VECTOR def_bool y - depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + depends on X86_MCE_INTEL config VM86 bool "Enable VM86 support" if EMBEDDED diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b50b9e9042c..6b8a974e127 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -115,13 +115,6 @@ void mcheck_init(struct cpuinfo_x86 *c); static inline void mcheck_init(struct cpuinfo_x86 *c) {} #endif -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - #ifdef CONFIG_X86_ANCIENT_MCE void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); @@ -208,11 +201,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_NEW_MCE void mce_log_therm_throt_event(__u64 status); -#else -static inline void mce_log_therm_throt_event(__u64 status) {} -#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2..022a036ce21 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,9 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += 
non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc60..00000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 
*c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7da8fec9ca8..5ff6362ecb1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -58,8 +58,6 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -1993,51 +1991,6 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); 
- break; - - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); -} - -static int __init mcheck_enable(char *str) -{ - mce_p5_enabled = 1; - return 1; -} -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_OLD_MCE */ - /* * Old style boot options parsing. Only for compatibility. */ diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb..00000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for 
PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include -#include -#include -#include - -#include -#include -#include - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. 
- */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f817818..00000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if 
(mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check 
architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} -- cgit v1.2.3-70-g09d2 From c1ebf835617035b1f08f734247dcb981e17aac6b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:41 +0200 Subject: x86: mce: Rename CONFIG_X86_NEW_MCE to CONFIG_X86_MCE Drop the CONFIG_X86_NEW_MCE symbol and change all references to it to check for CONFIG_X86_MCE directly. No code changes Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 11 +++-------- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 +-- arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/signal.c | 2 +- 7 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d986769a7d9..06880ca677f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,15 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_NEW_MCE - depends on X86_MCE - bool - default y - config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. 
@@ -797,7 +792,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -817,7 +812,7 @@ config X86_MCE_THRESHOLD default y config X86_MCE_INJECT - depends on X86_NEW_MCE + depends on X86_MCE tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa0785..5e3f2044f0d 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..f4227289caf 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 022a036ce21..4ac6d48fe11 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,5 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f..74656d1d4e3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -104,7 +104,7 @@ static int 
show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2..8a194ad357e 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -190,7 +190,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94..cc26ad4c307 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); -- cgit v1.2.3-70-g09d2 From 9eda8cb3ac235217e4ffa01cb9cedee1c1550599 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:42 +0200 Subject: x86: mce: Move code in mce.c Now that the X86_OLD_MCE ifdefs are gone move some code that used to be outside the big ifdef to a more natural place near its user. No code change. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5ff6362ecb1..e16271f01ac 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -45,17 +45,6 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -1322,6 +1311,17 @@ static void mce_init_timer(void) add_timer(t); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: -- cgit v1.2.3-70-g09d2 From cebe182033f156b430952370fb0f9dbe6e89b081 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:43 +0200 Subject: x86: mce: Move per bank data in a single datastructure This addresses one of the leftover review comments. Move the per bank data into a single structure. This avoids several separate variables and also separate allocation of sysfs objects. I didn't move the CMCI ownership information so far because that would have needed some non trivial changes in the algorithms. Signed-off-by: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 14 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 109 +++++++++++++++--------------- 2 files changed, 67 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e..6bd51e7ba87 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include #include enum severity_level { @@ -10,6 +11,19 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e16271f01ac..a04806e01a8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -64,7 +64,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -74,13 +73,13 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; @@ -91,11 +90,6 @@ 
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -482,7 +476,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -903,7 +897,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; @@ -1146,6 +1140,21 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. */ @@ -1169,11 +1178,10 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = mce_banks_init(); + if (err) + return err; } /* Use accurate RIP reporting if available. 
*/ @@ -1205,9 +1213,10 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -1223,7 +1232,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1237,7 +1246,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1250,8 +1259,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. 
Enable @@ -1578,7 +1587,8 @@ static int mce_disable(void) int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } return 0; @@ -1654,14 +1664,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1672,7 +1683,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1816,7 +1827,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1825,10 +1836,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1846,7 +1857,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + 
sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1863,7 +1874,8 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } } @@ -1879,8 +1891,9 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); } } @@ -1928,35 +1941,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } static __init int mce_init_device(void) @@ -1969,9 +1968,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) -- cgit v1.2.3-70-g09d2 From a2d32bcbc008aa0f9c301a7c6f3494cb23e6af54 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:44 
+0200 Subject: x86: mce: macros to compute banks MSRs Instead of open coded calculations for bank MSRs hide the indexing of higher banks MCE register MSRs in new macros. No semantic changes. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 7 +++++++ arch/x86/kernel/cpu/mcheck/mce.c | 34 +++++++++++++++++----------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 +++++----- 3 files changed, 29 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e..3d1ce094586 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -81,8 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + #define CMCI_EN (1ULL << 30) #define CMCI_THRESHOLD_MASK 0xffffULL diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a04806e01a8..07139a0578e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -267,11 +267,11 @@ static int msr_to_offset(u32 msr) unsigned bank = __get_cpu_var(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return 
offsetof(struct mce, mcgstatus); @@ -485,7 +485,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -500,9 +500,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -518,7 +518,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -539,7 +539,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -823,7 +823,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -904,7 +904,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -945,9 +945,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. 
Queue address for later processing. @@ -1216,8 +1216,8 @@ static void mce_init(void) struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -1589,7 +1589,7 @@ static int mce_disable(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } @@ -1876,7 +1876,7 @@ static void mce_disable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } @@ -1893,7 +1893,7 @@ static void mce_reenable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a3..889f665fe93 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? 
-- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +152,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); -- cgit v1.2.3-70-g09d2 From 3ccdccfadbd2548abe38682b587f4ba27eac2fc9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:45 +0200 Subject: x86: mce: Lower maximum number of banks to architecture limit The Intel x86 architecture right now only supports 32 machine check banks, more would bump into other MSRs. So lower the max define to 32. This only affects a few bitmaps, most data structures are dynamically sized anyways. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6b8a974e127..ad753537291 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -130,10 +130,11 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); /* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. + * Maximum banks number. + * This is the limit of the current register layout on + * Intel CPUs. 
*/ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) +#define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; -- cgit v1.2.3-70-g09d2 From 9ff80942992cd5abd0779c815f310f65b7b83860 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 8 Jul 2009 22:03:53 +0400 Subject: x86: Clean up idt_descr and idt_table by using NR_VECTORS instead of hardcoded number Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090708180353.GH5301@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/traps.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9..d6f27c92854 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -982,7 +982,7 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332f475..7e4b1f5dec8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -76,7 +76,7 @@ char ignore_fpu_irq; * F0 0F bug workaround.. We have a special link segment * for this. */ -gate_desc idt_table[256] +gate_desc idt_table[NR_VECTORS] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; #endif -- cgit v1.2.3-70-g09d2 From a1b4f1a5b7f57be2593a9f1fca465a529c95fc07 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 5 Jul 2009 20:01:54 +0400 Subject: x86, ipi: Clean up safe_smp_processor_id() by using the cpu_has_apic() macro helper We already use a lot of cpu_has_ helpers. Let's do the same here for consistency.
Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090705160154.GB4791@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/ipi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index dbf5445727a..e6b4f517fcf 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -150,7 +150,7 @@ int safe_smp_processor_id(void) { int apicid, cpuid; - if (!boot_cpu_has(X86_FEATURE_APIC)) + if (!cpu_has_apic) return 0; apicid = hard_smp_processor_id(); -- cgit v1.2.3-70-g09d2 From e90476d3bab4322070c0afb3e3b55671de8664ea Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 11 Jul 2009 09:32:46 +0800 Subject: x86: Remove duplicated #include Remove duplicated #include in: arch/x86/kernel/dumpstack.c Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c8405718a4c..2d8a371d433 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -15,7 +15,6 @@ #include #include #include -#include #include -- cgit v1.2.3-70-g09d2 From 8bdbd962ecfcbdd96f9dbb02d780b4553afd2543 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Sat, 4 Jul 2009 00:35:45 +0100 Subject: x86/cpu: Clean up various files a bit No code changes except printk levels (although some of the K6 mtrr code might be clearer if there were a few as would splitting out some of the intel cache code). 
Signed-off-by: Alan Cox LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 37 ++++++----- arch/x86/kernel/cpu/bugs.c | 10 +-- arch/x86/kernel/cpu/bugs_64.c | 2 +- arch/x86/kernel/cpu/common.c | 8 +-- arch/x86/kernel/cpu/cyrix.c | 19 ++++-- arch/x86/kernel/cpu/hypervisor.c | 5 +- arch/x86/kernel/cpu/intel.c | 11 ++-- arch/x86/kernel/cpu/intel_cacheinfo.c | 116 +++++++++++++++++---------------- arch/x86/kernel/cpu/perfctr-watchdog.c | 45 ++++++------- arch/x86/kernel/cpu/proc.c | 2 +- arch/x86/kernel/cpu/vmware.c | 18 ++--- 11 files changed, 144 insertions(+), 129 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 28e5f595604..c6eb02e6987 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -45,8 +45,8 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) #define CBAR_ENB (0x80000000) #define CBAR_KEY (0X000000CB) if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); } } @@ -87,9 +87,10 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); + printk(KERN_CONT + "system stability may be impaired when more than 32 MB are used.\n"); else - printk("probably OK (after B9730xxxx).\n"); + printk(KERN_CONT "probably OK (after B9730xxxx).\n"); printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } @@ -219,8 +220,9 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. 
Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -398,7 +400,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) u32 level; level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } if (c->x86 == 0x10 || c->x86 == 0x11) @@ -487,27 +489,30 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < + ((tseg>>PMD_SHIFT) < (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); } } #endif } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, + unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { - if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_mask == 0) size = 64; + /* Tbird rev A1/A2 */ if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ + (c->x86_mask == 0 || c->x86_mask == 1)) size = 256; } return size; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c8e315f1aa8..01a26521239 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -81,7 +81,7 @@ static void __init check_fpu(void) 
boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk("Hmm, FPU with FDIV bug.\n"); + printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); } static void __init check_hlt(void) @@ -98,7 +98,7 @@ static void __init check_hlt(void) halt(); halt(); halt(); - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); } /* @@ -122,9 +122,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk("Buggy.\n"); + printk(KERN_CONT "Buggy.\n"); else - printk("OK.\n"); + printk(KERN_CONT "OK.\n"); #endif } @@ -156,7 +156,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 9a3ed0649d4..04f0fe5af83 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk("CPU: "); + printk(KERN_INFO "CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d6f27c92854..c96ea44928b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,8 +18,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -28,13 +28,13 @@ #include #include #include -#include +#include #include #include #include #include #include -#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 593171e967e..19807b89f05 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -3,10 +3,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include @@ -282,7 +282,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * The 5510/5520 companion chips 
have a funky PIT. */ if (vendor == PCI_VENDOR_ID_CYRIX && - (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif @@ -299,7 +300,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) * ? : 0x7x * GX1 : 0x8x GX1 datasheet 56 */ - if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) + if ((0x30 <= dir1 && dir1 <= 0x6f) || + (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); return; } else { /* MediaGX */ @@ -427,9 +429,12 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); local_irq_restore(flags); } } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index fb5b86af0b0..93ba8eeb100 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -28,11 +28,10 @@ static inline void __cpuinit detect_hypervisor_vendor(struct cpuinfo_x86 *c) { - if (vmware_platform()) { + if (vmware_platform()) c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; - } else { + else c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; - } } unsigned long get_hypervisor_tsc_freq(void) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3260ab04499..80a722a071b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -7,17 +7,17 @@ #include #include #include +#include #include #include #include -#include #include #include #include #ifdef 
CONFIG_X86_64 -#include +#include #include #endif @@ -174,7 +174,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. + * have the F0 0F bug, which lets nonprivileged users lock up the + * system. * Note that the workaround only should be initialized once... */ c->f00f_bug = 0; @@ -207,7 +208,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); } } @@ -283,7 +284,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) /* Intel has a non-standard dependency on %ecx for this CPUID level. */ cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); if (eax & 0x1f) - return ((eax >> 26) + 1); + return (eax >> 26) + 1; else return 1; } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 789efe217e1..306bf0dca06 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -3,7 +3,7 @@ * * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj : Work with CPU hotplug infrastructure. + * Ashok Raj : Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 
*/ @@ -16,7 +16,7 @@ #include #include -#include +#include #include #define LVL_1_INST 1 @@ -25,14 +25,15 @@ #define LVL_3 4 #define LVL_TRACE 5 -struct _cache_table -{ +struct _cache_table { unsigned char descriptor; char cache_type; short size; }; -/* all the cache descriptor types we care about (no TLB or trace cache entries) */ +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + static const struct _cache_table __cpuinitconst cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ @@ -105,8 +106,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = }; -enum _cache_type -{ +enum _cache_type { CACHE_TYPE_NULL = 0, CACHE_TYPE_DATA = 1, CACHE_TYPE_INST = 2, @@ -170,31 +170,31 @@ unsigned short num_cache_leaves; Maybe later */ union l1_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 8; - unsigned assoc : 8; - unsigned size_in_kb : 8; + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; }; unsigned val; }; union l2_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned size_in_kb : 16; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; }; unsigned val; }; union l3_cache { struct { - unsigned line_size : 8; - unsigned lines_per_tag : 4; - unsigned assoc : 4; - unsigned res : 2; - unsigned size_encoded : 14; + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; }; unsigned val; }; @@ -350,7 +350,8 @@ static int __cpuinit find_num_cache_leaves(void) unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, 
new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; @@ -377,8 +378,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) retval = cpuid4_cache_lookup_regs(i, &this_leaf); if (retval >= 0) { - switch(this_leaf.eax.split.level) { - case 1: + switch (this_leaf.eax.split.level) { + case 1: if (this_leaf.eax.split.type == CACHE_TYPE_DATA) new_l1d = this_leaf.size/1024; @@ -386,19 +387,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) CACHE_TYPE_INST) new_l1i = this_leaf.size/1024; break; - case 2: + case 2: new_l2 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; break; - case 3: + case 3: new_l3 = this_leaf.size/1024; num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); + index_msb = get_count_order( + num_threads_sharing); l3_id = c->apicid >> index_msb; break; - default: + default: break; } } @@ -421,22 +423,21 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; - for ( i = 0 ; i < n ; i++ ) { + for (i = 0 ; i < n ; i++) { cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); /* If bit 31 is set, this is an unknown format */ - for ( j = 0 ; j < 3 ; j++ ) { - if (regs[j] & (1 << 31)) regs[j] = 0; - } + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; /* Byte 0 is level count, not a descriptor */ - for ( j = 1 ; j < 16 ; j++ ) { + for (j = 1 ; j < 16 ; j++) { unsigned char des = dp[j]; unsigned char k = 0; /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) - { + while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { if (only_trace && cache_table[k].cache_type != LVL_TRACE) break; @@ -488,14 +489,14 @@ unsigned int __cpuinit
init_intel_cacheinfo(struct cpuinfo_x86 *c) } if (trace) - printk (KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if ( l1i ) - printk (KERN_INFO "CPU: L1 I cache: %dK", l1i); + printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); + else if (l1i) + printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); if (l1d) - printk(", L1 D cache: %dK\n", l1d); + printk(KERN_CONT ", L1 D cache: %dK\n", l1d); else - printk("\n"); + printk(KERN_CONT "\n"); if (l2) printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); @@ -558,8 +559,13 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) {} -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) {} +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +} + +static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +{ +} #endif static void __cpuinit free_cache_attributes(unsigned int cpu) @@ -645,7 +651,7 @@ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); static ssize_t show_##file_name \ (struct _cpuid4_info *this_leaf, char *buf) \ { \ - return sprintf (buf, "%lu\n", (unsigned long)this_leaf->object + val); \ + return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } show_one_plus(level, eax.split.level, 0); @@ -656,7 +662,7 @@ show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) { - return sprintf (buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -669,7 +675,7 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, const struct cpumask *mask; mask = to_cpumask(this_leaf->shared_cpu_map); - n = type? + n = type ? 
cpulist_scnprintf(buf, len-2, mask) : cpumask_scnprintf(buf, len-2, mask); buf[n++] = '\n'; @@ -800,7 +806,7 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -static struct attribute * default_attrs[] = { +static struct attribute *default_attrs[] = { &type.attr, &level.attr, &coherency_line_size.attr, @@ -815,7 +821,7 @@ static struct attribute * default_attrs[] = { NULL }; -static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -828,8 +834,8 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) return ret; } -static ssize_t store(struct kobject * kobj, struct attribute * attr, - const char * buf, size_t count) +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) { struct _cache_attr *fattr = to_attr(attr); struct _index_kobject *this_leaf = to_object(kobj); @@ -883,7 +889,7 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) goto err_out; per_cpu(index_kobject, cpu) = kzalloc( - sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); + sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); if (unlikely(per_cpu(index_kobject, cpu) == NULL)) goto err_out; @@ -917,7 +923,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } for (i = 0; i < num_cache_leaves; i++) { - this_object = INDEX_KOBJECT_PTR(cpu,i); + this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; retval = kobject_init_and_add(&(this_object->kobj), @@ -925,9 +931,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) per_cpu(cache_kobject, cpu), "index%1lu", i); if (unlikely(retval)) { - 
for (j = 0; j < i; j++) { - kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj)); - } + for (j = 0; j < i; j++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); return retval; @@ -952,7 +957,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); for (i = 0; i < num_cache_leaves; i++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); + kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); } @@ -977,8 +982,7 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = -{ +static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 5c481f6205b..8100a29c854 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -68,16 +68,16 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); + return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); + return msr - MSR_ARCH_PERFMON_PERFCTR0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_PERFCTR0); + return msr - MSR_P6_PERFCTR0; case 15: - return (msr - MSR_P4_BPU_PERFCTR0); + return msr - MSR_P4_BPU_PERFCTR0; } } return 0; @@ -92,16 +92,16 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); + 
return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + return msr - MSR_ARCH_PERFMON_EVENTSEL0; switch (boot_cpu_data.x86) { case 6: - return (msr - MSR_P6_EVNTSEL0); + return msr - MSR_P6_EVNTSEL0; case 15: - return (msr - MSR_P4_BSU_ESCR0); + return msr - MSR_P4_BSU_ESCR0; } } return 0; @@ -113,7 +113,7 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) { BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } /* checks the an msr for availability */ @@ -124,7 +124,7 @@ int avail_to_resrv_perfctr_nmi(unsigned int msr) counter = nmi_perfctr_msr_to_bit(msr); BUG_ON(counter > NMI_MAX_COUNTER_BITS); - return (!test_bit(counter, perfctr_nmi_owner)); + return !test_bit(counter, perfctr_nmi_owner); } EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); @@ -237,7 +237,7 @@ static unsigned int adjust_for_32bit_ctr(unsigned int hz) */ counter_val = (u64)cpu_khz * 1000; do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { + if (counter_val > 0x7fffffffULL) { u64 count = (u64)cpu_khz * 1000; do_div(count, 0x7fffffffUL); retval = count + 1; @@ -251,7 +251,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -262,7 +262,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); - if(descr) + if (descr) pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } @@ -296,7 +296,7 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); /* 
initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -387,7 +387,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); + write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; @@ -415,7 +415,7 @@ static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) apic_write(APIC_LVTPC, APIC_DM_NMI); /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL,nmi_hz); + write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); } static const struct wd_ops p6_wd_ops = { @@ -490,9 +490,9 @@ static int setup_p4_watchdog(unsigned nmi_hz) if (smp_num_siblings == 2) { unsigned int ebx, apicid; - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; } else #endif ht_num = 0; @@ -544,7 +544,7 @@ static int setup_p4_watchdog(unsigned nmi_hz) } evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS + | P4_ESCR_OS | P4_ESCR_USR; cccr_val |= P4_CCCR_THRESHOLD(15) @@ -612,7 +612,7 @@ static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* - * P4 quirks: + * P4 quirks: * - An overflown perfctr will assert its interrupt * until the OVF flag in its CCCR is cleared. * - LVTPC is masked on interrupt and must be @@ -662,7 +662,8 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) * NOTE: Corresponding bit = 0 in ebx indicates event present. 
*/ cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + if ((eax.split.mask_length < + (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) return 0; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index d5e30397246..1e904346bbf 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -128,7 +128,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i]) seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", + x86_power_flags[i][0] ? " " : "", x86_power_flags[i]); else seq_printf(m, " [%d]", i); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 284c399e323..bc24f514ec9 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -49,17 +49,17 @@ static inline int __vmware_platform(void) static unsigned long __vmware_get_tsc_khz(void) { - uint64_t tsc_hz; - uint32_t eax, ebx, ecx, edx; + uint64_t tsc_hz; + uint32_t eax, ebx, ecx, edx; - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - return tsc_hz; + if (ebx == UINT_MAX) + return 0; + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + return tsc_hz; } /* -- cgit v1.2.3-70-g09d2 From 8045a4c293d36c61656a20d581b11f7f0cd7acd5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jul 2009 19:30:25 +0200 Subject: x86/oprofile: Fix cast of counter value When casting the counter value to a 64 bit value in 32 bit mode, sign extension may lead to broken counter values. This patch fixes this by casting to (u64) instead of (s64). 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 4 ++-- arch/x86/oprofile/op_model_p4.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index e95268eb922..7ca8306aefa 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -111,7 +111,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, if (counter_config[i].enabled && msrs->counters[i].addr) { reset_value[i] = counter_config[i].count; wrmsrl(msrs->counters[i].addr, - -(s64)counter_config[i].count); + -(u64)counter_config[i].count); rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; val |= op_x86_get_ctrl(model, &counter_config[i]); @@ -237,7 +237,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, if (val & OP_CTR_OVERFLOW) continue; oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -(s64)reset_value[i]); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[i]); } op_amd_handle_ibs(regs, msrs); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index f01e53b118f..9db9e361182 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -580,7 +580,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); wrmsrl(p4_counters[VIRT_CTR(stag, i)].counter_address, - -(s64)counter_config[i].count); + -(u64)counter_config[i].count); } else { reset_value[i] = 0; } @@ -625,11 +625,11 @@ static int p4_check_ctrs(struct pt_regs * const regs, if (CCCR_OVF_P(low) || !(ctr & OP_CTR_OVERFLOW)) { oprofile_add_sample(regs, i); wrmsrl(p4_counters[real].counter_address, - -(s64)reset_value[i]); + -(u64)reset_value[i]); CCCR_CLEAR_OVF(low); wrmsr(p4_counters[real].cccr_address, low, high); wrmsrl(p4_counters[real].counter_address, - -(s64)reset_value[i]); + -(u64)reset_value[i]); } } 
-- cgit v1.2.3-70-g09d2 From 44ab9a6b0e909145d42615493952fe986b1ce5c2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 18:33:02 +0200 Subject: x86/oprofile: Rework and simplify nmi_cpu_setup() This patch removes the function nmi_save_registers(). Per-cpu code is now executed only in the function nmi_cpu_setup(). Also, it renames the per-cpu function nmi_restore_registers() to nmi_cpu_restore_registers(). Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 93df76dd60f..25da1e17815 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -87,13 +87,6 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) } } -static void nmi_save_registers(void *dummy) -{ - int cpu = smp_processor_id(); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - nmi_cpu_save_registers(msrs); -} - static void free_msrs(void) { int i; @@ -137,6 +130,7 @@ static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); spin_unlock(&oprofilefs_lock); @@ -182,13 +176,12 @@ static int nmi_setup(void) } } - on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } -static void nmi_restore_registers(struct op_msrs *msrs) +static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; struct op_msr *controls = msrs->controls; @@ -220,7 +213,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_restore_registers(msrs); + nmi_cpu_restore_registers(msrs); } static void nmi_shutdown(void) -- cgit v1.2.3-70-g09d2 From 
6e63ea4b0b14ff5fb8a3ca704fcda7d28b95f079 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 7 Jul 2009 19:25:39 +0200 Subject: x86/oprofile: Whitespaces changes only This patch fixes whitespace changes of code that will be touched in follow-on patches. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 12 ++++++------ arch/x86/oprofile/op_model_amd.c | 12 ++++++------ arch/x86/oprofile/op_model_p4.c | 8 ++++---- arch/x86/oprofile/op_model_ppro.c | 8 ++++---- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 25da1e17815..fca8dc94531 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -516,12 +516,12 @@ int __init op_nmi_init(struct oprofile_operations *ops) register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 7ca8306aefa..f676f8825a3 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -91,7 +91,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, int i; /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0; i < NUM_CONTROLS; ++i) { if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); @@ -229,7 +229,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; 
rdmsrl(msrs->counters[i].addr, val); @@ -250,7 +250,7 @@ static void op_amd_start(struct op_msrs const * const msrs) { u64 val; int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -270,7 +270,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -285,11 +285,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { + for (i = 0; i < NUM_CONTROLS; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 9db9e361182..5921b7fc724 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -558,7 +558,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, } /* clear the cccrs we will use */ - for (i = 0 ; i < num_counters ; i++) { + for (i = 0; i < num_counters; i++) { if (unlikely(!msrs->controls[i].addr)) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); @@ -575,7 +575,7 @@ static void p4_setup_ctrs(struct op_x86_model_spec const *model, } /* setup all counters */ - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < num_counters; ++i) { if (counter_config[i].enabled && msrs->controls[i].addr) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); @@ -678,7 +678,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < 
num_counters; ++i) { if (msrs->counters[i].addr) release_perfctr_nmi(msrs->counters[i].addr); } @@ -687,7 +687,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) * conjunction with the counter registers (hence the starting offset). * This saves a few bits. */ - for (i = num_counters ; i < num_controls ; ++i) { + for (i = num_counters; i < num_controls; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(msrs->controls[i].addr); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index cd72d5c73b4..570d717c330 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -81,7 +81,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, } /* clear all counters */ - for (i = 0 ; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) { if (unlikely(!msrs->controls[i].addr)) continue; rdmsrl(msrs->controls[i].addr, val); @@ -125,7 +125,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, if (unlikely(!reset_value)) goto out; - for (i = 0 ; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; rdmsrl(msrs->counters[i].addr, val); @@ -188,11 +188,11 @@ static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < num_counters; ++i) { if (msrs->counters[i].addr) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } - for (i = 0 ; i < num_counters ; ++i) { + for (i = 0; i < num_counters; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } -- cgit v1.2.3-70-g09d2 From 6b2b171a774af256082635b53ac387b1613b7b4c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 02:53:50 -0700 Subject: x86/acpi: acpi_parse_madt_ioapic_entries: remove redundant braces We don't put braces around a single statement. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/acpi/boot.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6b8ca3a0285..ce31c1af854 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1179,9 +1179,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) * If MPS is present, it will handle them, * otherwise the system will stay in PIC mode */ - if (acpi_disabled || acpi_noirq) { + if (acpi_disabled || acpi_noirq) return -ENODEV; - } if (!cpu_has_apic) return -ENODEV; -- cgit v1.2.3-70-g09d2 From 2f210deba9887dd9143b63b217506f1ac152e91c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 02:55:22 -0700 Subject: x86/ioapic.c: ioapic_modify_irq is too large to inline If ioapic_modify_irq() is marked inline, it gets inlined several times. Un-inlining it saves around 200 bytes in .text for me. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 90b5e6efa93..82271eb87bb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -556,9 +556,9 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, add_pin_to_irq_node(cfg, node, newapic, newpin); } -static inline void io_apic_modify_irq(struct irq_cfg *cfg, - int mask_and, int mask_or, - void (*final)(struct irq_pin_list *entry)) +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) { int pin; struct irq_pin_list *entry; -- cgit v1.2.3-70-g09d2 From 890aeacf64c55a7ada7054a140d249ab13899f2d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 02:57:43 -0700 Subject: x86/ioapic.c: unify __mask_IO_APIC_irq() The main difference between 32 and 
64-bit __mask_IO_APIC_irq() is that the 64-bit version does a readback from the I/O APIC to synchronize it. If there's a hardware requirement to do a readback sync after updating an APIC register, then it will be a hardware requirement regardless of whether the kernel is compiled 32 or 64-bit. Unify __mask_IO_APIC_irq() using the 64-bit version which always syncs with io_apic_sync(). Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 82271eb87bb..f8aa5461071 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -580,7 +580,6 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); } -#ifdef CONFIG_X86_64 static void io_apic_sync(struct irq_pin_list *entry) { /* @@ -596,12 +595,8 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL); -} +#ifdef CONFIG_X86_32 static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, -- cgit v1.2.3-70-g09d2 From 916a0fe739f151664f7f07b42543ae6fd4caec49 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:00:22 -0700 Subject: x86/ioapic.c: remove #ifdef for 82093AA workaround While no 64-bit hardware will have a version 0x11 I/O APIC which needs the level/edge bug workaround, that's not a particular reason to use CONFIG_X86_32 to #ifdef the code out. Most 32-bit machines will no longer need the workaround either, so the test to see whether it is necessary should be more fine-grained than "32-bit=yes, 64-bit=no". (Also fix formatting of block comment.)
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 47 +++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f8aa5461071..1a341444258 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -596,7 +596,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -#ifdef CONFIG_X86_32 static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) { io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, @@ -608,7 +607,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, IO_APIC_REDIR_LEVEL_TRIGGER, NULL); } -#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { @@ -2510,11 +2508,8 @@ atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - -#ifdef CONFIG_X86_32 unsigned long v; int i; -#endif struct irq_cfg *cfg; int do_unmask_irq = 0; @@ -2527,31 +2522,28 @@ static void ack_apic_level(unsigned int irq) } #endif -#ifdef CONFIG_X86_32 /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. 
We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. 
--macro + */ cfg = desc->chip_data; i = cfg->vector; - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); -#endif /* * We must acknowledge the irq before we move it or the acknowledge will @@ -2593,7 +2585,7 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq_desc(desc); } -#ifdef CONFIG_X86_32 + /* Tail end of version 0x11 I/O APIC bug workaround */ if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2601,7 +2593,6 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(cfg); spin_unlock(&ioapic_lock); } -#endif } #ifdef CONFIG_INTR_REMAP -- cgit v1.2.3-70-g09d2 From 83c21bedf63ce92a2dd82ae2c7a96179b0aa4372 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:13:04 -0700 Subject: x86/ioapic.c: remove redundant declaration of irq_pin_list The structure is defined immediately below, so there's no need to forward declare it. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1a341444258..ec52e0c045c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -116,8 +116,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_pin_list; - /* * This is performance-critical, we want to do it O(1) * -- cgit v1.2.3-70-g09d2 From 8e13d697febc1ba17e70ed88789255c8bc25aa41 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:14:59 -0700 Subject: x86/ioapic.c: move lost comment to what seems like appropriate place The comment got separated from its subject, so move it to what appears to be the right place, and update to describe the current structure. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ec52e0c045c..a097a773bc7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -116,13 +116,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -137,6 +130,11 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) return pin; } +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ struct irq_cfg { struct irq_pin_list *irq_2_pin; cpumask_var_t domain; -- cgit v1.2.3-70-g09d2 From d8c52063ed85dda61b70bc05b90711478db5dc17 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:17:58 -0700 Subject: x86/ioapic.c: convert io_apic_level_ack_pending loop to normal for() loop Convert the unconventional loop in io_apic_level_ack_pending() to a conventional for() loop. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index a097a773bc7..0d0401802d4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -410,13 +410,10 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; int pin; - if (!entry) - break; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ @@ -424,9 +421,6 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) spin_unlock_irqrestore(&ioapic_lock, flags); return true; } - if (!entry->next) - break; - entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); -- cgit v1.2.3-70-g09d2 From 875e68ec32fc5495f3edf987aaae1c52306184b7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:24:11 -0700 Subject: x86/ioapic.c: simplify add_pin_to_irq_node() Rather than duplicating the same alloc/init code twice, restructure the function to look for duplicates and then add an entry if none is found. This function is not performance critical; all but one of its callers are __init functions, and the non-__init caller is for PCI device setup. 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0d0401802d4..d9e8f19088d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -490,34 +490,22 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list *entry; + struct irq_pin_list **entryp, *entry; - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(node); - if (!entry) { - printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", - apic, pin); - return; - } - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - return; - } - - while (entry->next) { + for (entryp = &cfg->irq_2_pin; + *entryp != NULL; + entryp = &(*entryp)->next) { + entry = *entryp; /* not again, please */ if (entry->apic == apic && entry->pin == pin) return; - - entry = entry->next; } - entry->next = get_one_free_irq_2_pin(node); - entry = entry->next; + entry = get_one_free_irq_2_pin(node); entry->apic = apic; entry->pin = pin; + + *entryp = entry; } /* -- cgit v1.2.3-70-g09d2 From 535b64291a9d1ff8bc54642494a5fce27e1e1170 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:29:26 -0700 Subject: x86/ioapic.c: convert replace_pin_at_irq_node to conventional for() loop Use a conventional for() loop in replace_pin_at_irq_node(). 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d9e8f19088d..9386976b675 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -515,10 +515,10 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = cfg->irq_2_pin; + struct irq_pin_list *entry; int replaced = 0; - while (entry) { + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -526,7 +526,6 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, /* every one is different, right? */ break; } - entry = entry->next; } /* why? call replace before add? */ -- cgit v1.2.3-70-g09d2 From 4eea6fff612f54380dd642b045bf03ac0613fe3e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:32:15 -0700 Subject: x86/ioapic.c: clean up replace_pin_at_irq_node logic and comments There's no need for a control variable in replace_pin_at_irq_node(); it can just return if it finds the old apic/pin to replace. If the loop terminates, then it didn't find the old apic/pin, so it can add the new ones. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9386976b675..8245e62ed93 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -512,25 +512,22 @@ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin * Reroute an IRQ to a different pin. 
*/ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, - int oldapic, int oldpin, - int newapic, int newpin) + int oldapic, int oldpin, + int newapic, int newpin) { struct irq_pin_list *entry; - int replaced = 0; for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - replaced = 1; /* every one is different, right? */ - break; + return; } } - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq_node(cfg, node, newapic, newpin); + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); } static void io_apic_modify_irq(struct irq_cfg *cfg, -- cgit v1.2.3-70-g09d2 From 638f2f8c52a92c15ebda9e50d84c1ab56fc42e42 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:37:52 -0700 Subject: x86/ioapic.c: convert __target_IO_APIC_irq to conventional for() loop Use a normal for() loop in __target_IO_APIC_irq(). 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8245e62ed93..17883cd8259 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2236,13 +2236,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - entry = cfg->irq_2_pin; - for (;;) { + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { unsigned int reg; - if (!entry) - break; - apic = entry->apic; pin = entry->pin; /* @@ -2255,9 +2251,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; } } -- cgit v1.2.3-70-g09d2 From e25371d60cb06a44d7a32d7966ab9bfbeacb9390 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Jun 2009 03:49:01 -0700 Subject: x86/ioapic.c: unify ioapic_retrigger_irq() The 32 and 64-bit versions of ioapic_retrigger_irq() are identical except the 64-bit one takes vector_lock. vector_lock is defined and used on 32-bit too, so just use a common ioapic_retrigger_irq(). 
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/apic/io_apic.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 17883cd8259..cf51b0b58c5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2178,7 +2178,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { @@ -2191,14 +2190,6 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - apic->send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif /* * Level and edge triggered IO-APIC interrupts need different handling, -- cgit v1.2.3-70-g09d2 From 254e0a6bff87ab8b22293c4bd1443507df698407 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:08:54 +0900 Subject: x86: Use get_desc_base() Use get_desc_base() to get the base address in desc_struct Signed-off-by: Akinobu Mita LKML-Reference: <20090718150853.GA11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/doublefault_32.c | 4 +--- arch/x86/kernel/step.c | 9 ++++----- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index b4f14c6c09d..37250fe490b 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -27,9 +27,7 @@ static void doublefault_fn(void) if (ptr_ok(gdt)) { gdt += GDT_ENTRY_TSS << 3; - tss = *(u16 *)(gdt+2); - tss += *(u8 *)(gdt+4) << 16; - tss += *(u8 *)(gdt+7) << 24; + tss = get_desc_base((struct desc_struct *)gdt); printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index e8b9863ef8c..3149032ff10 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -4,6 +4,7 @@ #include 
#include #include +#include unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) { @@ -23,7 +24,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re * and APM bios ones we just ignore here. */ if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { - u32 *desc; + struct desc_struct *desc; unsigned long base; seg &= ~7UL; @@ -33,12 +34,10 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re addr = -1L; /* bogus selector, access would fault */ else { desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); + base = get_desc_base(desc); /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) + if (!desc->d) addr &= 0xffff; addr += base; } -- cgit v1.2.3-70-g09d2 From fde0312d01b60a3fd5dc56e69a9613defbbc7097 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:09:56 +0900 Subject: x86: Remove unused patch_espfix_desc() patch_espfix_desc() is not used after commit dc4c2a0aed3b09f6e255bd5c3faa50fe6e0b2ded Signed-off-by: Akinobu Mita LKML-Reference: <20090718150955.GB11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 4 +--- arch/x86/kernel/traps.c | 21 --------------------- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index bfd74c032fc..4da91ad69e0 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -81,9 +81,7 @@ extern int panic_on_unrecovered_nmi; void math_error(void __user *); void math_emulate(struct math_emu_info *); -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long, unsigned long); -#else +#ifndef CONFIG_X86_32 asmlinkage void smp_thermal_interrupt(void); asmlinkage void mce_threshold_interrupt(void); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5204332f475..23679411020 100644 --- 
a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -786,27 +786,6 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#endif - asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } -- cgit v1.2.3-70-g09d2 From 57594742a2b545f8f114cda34f15650be8ae976d Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:11:06 +0900 Subject: x86: Introduce set_desc_base() and set_desc_limit() Rename set_base()/set_limit to set_desc_base()/set_desc_limit() and rewrite them in C. These are naturally introduced by the idea of get_desc_base()/get_desc_limit(). The conversion actually found a bug in apm_32.c: bad_bios_desc is written at run-time, but it is defined as a const variable.
Signed-off-by: Akinobu Mita LKML-Reference: <20090718151105.GC11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 13 +++++++++++++ arch/x86/include/asm/stackprotector.h | 4 +--- arch/x86/include/asm/system.h | 27 --------------------------- arch/x86/kernel/apm_32.c | 18 +++++++++--------- drivers/pnp/pnpbios/bioscalls.c | 21 +++++++++++---------- 5 files changed, 34 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index c993e9e0fed..e8de2f6f5ca 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -291,11 +291,24 @@ static inline unsigned long get_desc_base(const struct desc_struct *desc) return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24); } +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + static inline unsigned long get_desc_limit(const struct desc_struct *desc) { return desc->limit0 | (desc->limit << 16); } +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index c2d742c6e15..cdc5e0b126a 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -90,9 +90,7 @@ static inline void setup_stack_canary_segment(int cpu) struct desc_struct desc; desc = gdt_table[GDT_ENTRY_STACK_CANARY]; - desc.base0 = canary & 0xffff; - desc.base1 = (canary >> 16) & 0xff; - desc.base2 = (canary >> 24) & 0xff; + set_desc_base(&desc, canary); write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); #endif } diff --git 
a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 643c59b4bc6..75c49c782e2 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -150,33 +150,6 @@ do { \ #endif #ifdef __KERNEL__ -#define _set_base(addr, base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while (0) - -#define _set_limit(addr, limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while (0) - -#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base)) -#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1)) extern void native_load_gs_index(unsigned); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 79302e9a33a..b5e841bd60d 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; +static struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -2337,8 +2337,8 @@ static int __init apm_init(void) * This is for buggy BIOS's that refer to (real mode) segment 0x40 * even though they are called in protected mode. 
*/ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); + set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); + set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); /* * Set up the long jump entry point to the APM BIOS, which is called @@ -2358,12 +2358,12 @@ static int __init apm_init(void) * code to that CPU. */ gdt = get_cpu_gdt_table(0); - set_base(gdt[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(gdt[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(gdt[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); proc_create("apm", 0, NULL, &apm_file_ops); diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index 7e6b5a3b328..45ad3e9cc36 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -55,9 +55,9 @@ __asm__(".text \n" #define Q2_SET_SEL(cpu, selname, address, size) \ do { \ -struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ -set_base(gdt[(selname) >> 3], (u32)(address)); \ -set_limit(gdt[(selname) >> 3], size); \ + struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ + set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \ + set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) static struct desc_struct bad_bios_desc; @@ -479,16 +479,17 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) bad_bios_desc.a = 0; bad_bios_desc.b = 0x00409200; - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); + set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); + 
set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); for_each_possible_cpu(i) { struct desc_struct *gdt = get_cpu_gdt_table(i); if (!gdt) continue; - set_base(gdt[GDT_ENTRY_PNPBIOS_CS32], &pnp_bios_callfunc); - set_base(gdt[GDT_ENTRY_PNPBIOS_CS16], - __va(header->fields.pm16cseg)); - set_base(gdt[GDT_ENTRY_PNPBIOS_DS], - __va(header->fields.pm16dseg)); + set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32], + (unsigned long)&pnp_bios_callfunc); + set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS16], + (unsigned long)__va(header->fields.pm16cseg)); + set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_DS], + (unsigned long)__va(header->fields.pm16dseg)); } } -- cgit v1.2.3-70-g09d2 From 4d4036e0e7299c6cbb2d2421b4b30b7a409ce61a Mon Sep 17 00:00:00 2001 From: Jason Yeh Date: Wed, 8 Jul 2009 13:49:38 +0200 Subject: oprofile: Implement performance counter multiplexing The number of hardware counters is limited. The multiplexing feature enables OProfile to gather more events than counters are provided by the hardware. This is realized by switching between events at a user-specified time interval. A new file (/dev/oprofile/time_slice) is added for the user to specify the timer interval in ms. If the number of events to profile is higher than the number of hardware counters available, the patch will schedule a work queue that switches the event counter and re-writes the different sets of values into it. The switching mechanism needs to be implemented for each architecture to support multiplexing. This patch only implements AMD CPU support, but multiplexing can be easily extended for other models and architectures. There are follow-on patches that rework parts of this patch.
Signed-off-by: Jason Yeh Signed-off-by: Robert Richter --- arch/Kconfig | 12 +++ arch/x86/oprofile/nmi_int.c | 162 ++++++++++++++++++++++++++++++++++++-- arch/x86/oprofile/op_counter.h | 2 +- arch/x86/oprofile/op_model_amd.c | 110 ++++++++++++++++++++++---- arch/x86/oprofile/op_model_p4.c | 4 + arch/x86/oprofile/op_model_ppro.c | 2 + arch/x86/oprofile/op_x86_model.h | 7 ++ drivers/oprofile/oprof.c | 78 ++++++++++++++++++ drivers/oprofile/oprof.h | 2 + drivers/oprofile/oprofile_files.c | 43 ++++++++++ drivers/oprofile/oprofile_stats.c | 10 +++ include/linux/oprofile.h | 3 + 12 files changed, 415 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/Kconfig b/arch/Kconfig index 99193b16023..beea3ccebb5 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -30,6 +30,18 @@ config OPROFILE_IBS If unsure, say N. +config OPROFILE_EVENT_MULTIPLEX + bool "OProfile multiplexing support (EXPERIMENTAL)" + default n + depends on OPROFILE && X86 + help + The number of hardware counters is limited. The multiplexing + feature enables OProfile to gather more events than counters + are provided by the hardware. This is realized by switching + between events at an user specified time interval. + + If unsure, say N. 
+ config HAVE_OPROFILE bool diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index fca8dc94531..e54f6a0b35a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,11 +1,14 @@ /** * @file nmi_int.c * - * @remark Copyright 2002-2008 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon * @author Robert Richter + * @author Barry Kasindorf + * @author Jason Yeh + * @author Suravee Suthikulpanit */ #include @@ -24,6 +27,12 @@ #include "op_counter.h" #include "op_x86_model.h" + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +DEFINE_PER_CPU(int, switch_index); +#endif + + static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); @@ -31,6 +40,13 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +extern atomic_t multiplex_counter; +#endif + +struct op_counter_config counter_config[OP_MAX_COUNTER]; + /* common functions */ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, @@ -95,6 +111,11 @@ static void free_msrs(void) per_cpu(cpu_msrs, i).counters = NULL; kfree(per_cpu(cpu_msrs, i).controls); per_cpu(cpu_msrs, i).controls = NULL; + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; +#endif } } @@ -103,6 +124,9 @@ static int allocate_msrs(void) int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + size_t multiplex_size = sizeof(struct op_msr) * model->num_virt_counters; +#endif int i; for_each_possible_cpu(i) { @@ -118,6 +142,14 @@ static int allocate_msrs(void) success = 0; break; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + 
per_cpu(cpu_msrs, i).multiplex = + kmalloc(multiplex_size, GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).multiplex) { + success = 0; + break; + } +#endif } if (!success) @@ -126,6 +158,25 @@ static int allocate_msrs(void) return success; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void nmi_setup_cpu_mux(struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].addr = 0; + multiplex[i].saved = 0; + } + } +} + +#endif + static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); @@ -133,6 +184,9 @@ static void nmi_cpu_setup(void *dummy) nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + nmi_setup_cpu_mux(msrs); +#endif spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); @@ -173,14 +227,52 @@ static int nmi_setup(void) memcpy(per_cpu(cpu_msrs, cpu).controls, per_cpu(cpu_msrs, 0).controls, sizeof(struct op_msr) * model->num_controls); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); +#endif } - } on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + struct op_msr *multiplex = msrs->multiplex; + unsigned int i; + + for (i = 0; i < model->num_counters; ++i) { + int offset = i + si; + if (multiplex[offset].addr) { + rdmsrl(multiplex[offset].addr, + multiplex[offset].saved); + } + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + struct op_msr *multiplex 
= msrs->multiplex; + unsigned int i; + + for (i = 0; i < model->num_counters; ++i) { + int offset = i + si; + if (multiplex[offset].addr) { + wrmsrl(multiplex[offset].addr, + multiplex[offset].saved); + } + } +} + +#endif + static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; @@ -214,6 +306,9 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + __get_cpu_var(switch_index) = 0; +#endif } static void nmi_shutdown(void) @@ -252,16 +347,15 @@ static void nmi_stop(void) on_each_cpu(nmi_cpu_stop, NULL, 1); } -struct op_counter_config counter_config[OP_MAX_COUNTER]; - static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; - for (i = 0; i < model->num_counters; ++i) { + for (i = 0; i < model->num_virt_counters; ++i) { struct dentry *dir; char buf[4]; +#ifndef CONFIG_OPROFILE_EVENT_MULTIPLEX /* quick little hack to _not_ expose a counter if it is not * available for use. This should protect userspace app. 
* NOTE: assumes 1:1 mapping here (that counters are organized @@ -269,6 +363,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) */ if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) continue; +#endif /* CONFIG_OPROFILE_EVENT_MULTIPLEX */ snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); @@ -283,6 +378,57 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_counters; + if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 
0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!model->switch_ctrl) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + on_each_cpu(nmi_cpu_switch, NULL, 1); + + atomic_inc(&multiplex_counter); + + return 0; +} + +#endif + #ifdef CONFIG_SMP static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) @@ -516,12 +662,18 @@ int __init op_nmi_init(struct oprofile_operations *ops) register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + __raw_get_cpu_var(switch_index) = 0; +#endif ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + ops->switch_events = nmi_switch_event; +#endif if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 91b6a116165..e28398df0df 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,7 +10,7 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 8 +#define OP_MAX_COUNTER 32 /* Per-perfctr configuration as set via * oprofilefs. 
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index f676f8825a3..fdbed3a0c87 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -9,12 +9,15 @@ * @author Philippe Elie * @author Graydon Hoare * @author Robert Richter - * @author Barry Kasindorf + * @author Barry Kasindorf + * @author Jason Yeh + * @author Suravee Suthikulpanit */ #include #include #include +#include #include #include @@ -25,12 +28,23 @@ #define NUM_COUNTERS 4 #define NUM_CONTROLS 4 +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +#define NUM_VIRT_COUNTERS 32 +#define NUM_VIRT_CONTROLS 32 +#else +#define NUM_VIRT_COUNTERS NUM_COUNTERS +#define NUM_VIRT_CONTROLS NUM_CONTROLS +#endif + #define OP_EVENT_MASK 0x0FFF #define OP_CTR_OVERFLOW (1ULL<<31) #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -static unsigned long reset_value[NUM_COUNTERS]; +static unsigned long reset_value[NUM_VIRT_COUNTERS]; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +DECLARE_PER_CPU(int, switch_index); +#endif #ifdef CONFIG_OPROFILE_IBS @@ -82,6 +96,16 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) else msrs->controls[i].addr = 0; } + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + for (i = 0; i < NUM_VIRT_COUNTERS; i++) { + int hw_counter = i % NUM_CONTROLS; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; + else + msrs->multiplex[i].addr = 0; + } +#endif } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, @@ -90,6 +114,15 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, u64 val; int i; + /* setup reset_value */ + for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + if (counter_config[i].enabled) { + reset_value[i] = counter_config[i].count; + } else { + reset_value[i] = 0; + } + } + /* clear all counters */ for (i = 0; i < NUM_CONTROLS; ++i) { if (unlikely(!msrs->controls[i].addr)) @@ -108,20 +141,49 @@ static void op_amd_setup_ctrs(struct 
op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - if (counter_config[i].enabled && msrs->counters[i].addr) { - reset_value[i] = counter_config[i].count; - wrmsrl(msrs->counters[i].addr, - -(u64)counter_config[i].count); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + int offset = i + __get_cpu_var(switch_index); +#else + int offset = i; +#endif + if (counter_config[offset].enabled && msrs->counters[i].addr) { + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); + + /* setup control registers */ rdmsrl(msrs->controls[i].addr, val); val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[i]); + val |= op_x86_get_ctrl(model, &counter_config[offset]); + wrmsrl(msrs->controls[i].addr, val); + } + } +} + + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_amd_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (counter_config[offset].enabled) { + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[offset]); wrmsrl(msrs->controls[i].addr, val); - } else { - reset_value[i] = 0; } } } +#endif + + #ifdef CONFIG_OPROFILE_IBS static inline int @@ -230,14 +292,19 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, int i; for (i = 0; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + int offset = i + __get_cpu_var(switch_index); +#else + int offset = i; +#endif + if (!reset_value[offset]) continue; rdmsrl(msrs->counters[i].addr, val); /* bit is clear if overflowed: */ if (val & OP_CTR_OVERFLOW) continue; - oprofile_add_sample(regs, i); - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[i]); + oprofile_add_sample(regs, offset); + 
wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); } op_amd_handle_ibs(regs, msrs); @@ -250,8 +317,14 @@ static void op_amd_start(struct op_msrs const * const msrs) { u64 val; int i; + for (i = 0; i < NUM_COUNTERS; ++i) { - if (reset_value[i]) { +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + int offset = i + __get_cpu_var(switch_index); +#else + int offset = i; +#endif + if (reset_value[offset]) { rdmsrl(msrs->controls[i].addr, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(msrs->controls[i].addr, val); @@ -271,7 +344,11 @@ static void op_amd_stop(struct op_msrs const * const msrs) * pm callback */ for (i = 0; i < NUM_COUNTERS; ++i) { +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) +#else if (!reset_value[i]) +#endif continue; rdmsrl(msrs->controls[i].addr, val); val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -289,7 +366,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0; i < NUM_CONTROLS; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } @@ -463,6 +540,8 @@ static void op_amd_exit(void) {} struct op_x86_model_spec const op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_virt_counters = NUM_VIRT_COUNTERS, + .num_virt_controls = NUM_VIRT_CONTROLS, .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, @@ -473,4 +552,7 @@ struct op_x86_model_spec const op_amd_spec = { .start = &op_amd_start, .stop = &op_amd_stop, .shutdown = &op_amd_shutdown, +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + .switch_ctrl = &op_amd_switch_ctrl, +#endif }; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 5921b7fc724..65b9237cde8 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -698,6 +698,8 @@ static void p4_shutdown(struct 
op_msrs const * const msrs) struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, + .num_virt_counters = NUM_COUNTERS_HT2, + .num_virt_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -710,6 +712,8 @@ struct op_x86_model_spec const op_p4_ht2_spec = { struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, + .num_virt_counters = NUM_COUNTERS_NON_HT, + .num_virt_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 570d717c330..098cbca5c0b 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -206,6 +206,8 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = 2, .num_controls = 2, + .num_virt_counters = 2, + .num_virt_controls = 2, .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 505489873b9..0d07d23cb06 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -23,6 +23,7 @@ struct op_msr { struct op_msrs { struct op_msr *counters; struct op_msr *controls; + struct op_msr *multiplex; }; struct pt_regs; @@ -35,6 +36,8 @@ struct oprofile_operations; struct op_x86_model_spec { unsigned int num_counters; unsigned int num_controls; + unsigned int num_virt_counters; + unsigned int num_virt_controls; u64 reserved; u16 event_mask; int (*init)(struct oprofile_operations *ops); @@ -47,6 +50,10 @@ struct op_x86_model_spec { void (*start)(struct op_msrs const * const msrs); void (*stop)(struct op_msrs const * 
const msrs); void (*shutdown)(struct op_msrs const * const msrs); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + void (*switch_ctrl)(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs); +#endif }; struct op_counter_config; diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 3cffce90f82..7bc64af7cf9 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include "oprof.h" @@ -27,6 +29,15 @@ unsigned long oprofile_backtrace_depth; static unsigned long is_setup; static DEFINE_MUTEX(start_mutex); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void switch_worker(struct work_struct *work); +static DECLARE_DELAYED_WORK(switch_work, switch_worker); +unsigned long timeout_jiffies; +#define MULTIPLEXING_TIMER_DEFAULT 1 + +#endif + /* timer 0 - use performance monitoring hardware if available 1 - use the timer int mechanism regardless @@ -87,6 +98,20 @@ out: return err; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void start_switch_worker(void) +{ + schedule_delayed_work(&switch_work, timeout_jiffies); +} + +static void switch_worker(struct work_struct *work) +{ + if (!oprofile_ops.switch_events()) + start_switch_worker(); +} + +#endif /* Actually start profiling (echo 1>/dev/oprofile/enable) */ int oprofile_start(void) @@ -108,6 +133,11 @@ int oprofile_start(void) if ((err = oprofile_ops.start())) goto out; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + if (oprofile_ops.switch_events) + start_switch_worker(); +#endif + oprofile_started = 1; out: mutex_unlock(&start_mutex); @@ -123,6 +153,11 @@ void oprofile_stop(void) goto out; oprofile_ops.stop(); oprofile_started = 0; + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + cancel_delayed_work_sync(&switch_work); +#endif + /* wake up the daemon to read what remains */ wake_up_buffer_waiter(); out: @@ -155,6 +190,36 @@ post_sync: mutex_unlock(&start_mutex); } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +/* 
User inputs in ms, converts to jiffies */ +int oprofile_set_timeout(unsigned long val_msec) +{ + int err = 0; + + mutex_lock(&start_mutex); + + if (oprofile_started) { + err = -EBUSY; + goto out; + } + + if (!oprofile_ops.switch_events) { + err = -EINVAL; + goto out; + } + + timeout_jiffies = msecs_to_jiffies(val_msec); + if (timeout_jiffies == MAX_JIFFY_OFFSET) + timeout_jiffies = msecs_to_jiffies(MULTIPLEXING_TIMER_DEFAULT); + +out: + mutex_unlock(&start_mutex); + return err; + +} + +#endif int oprofile_set_backtrace(unsigned long val) { @@ -179,10 +244,23 @@ out: return err; } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void __init oprofile_multiplexing_init(void) +{ + timeout_jiffies = msecs_to_jiffies(MULTIPLEXING_TIMER_DEFAULT); +} + +#endif + static int __init oprofile_init(void) { int err; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + oprofile_multiplexing_init(); +#endif + err = oprofile_arch_init(&oprofile_ops); if (err < 0 || timer) { diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index c288d3c24b5..ee38abcc74f 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -27,6 +27,7 @@ extern unsigned long oprofile_buffer_watershed; extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; extern unsigned long oprofile_backtrace_depth; +extern unsigned long timeout_jiffies; struct super_block; struct dentry; @@ -35,5 +36,6 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root); void oprofile_timer_init(struct oprofile_operations *ops); int oprofile_set_backtrace(unsigned long depth); +int oprofile_set_timeout(unsigned long time); #endif /* OPROF_H */ diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index 5d36ffc30dd..468ec3e4f85 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -9,6 +9,7 @@ #include #include +#include #include "event_buffer.h" #include "oprofile_stats.h" @@ -22,6 +23,45 @@ unsigned long 
oprofile_buffer_size; unsigned long oprofile_cpu_buffer_size; unsigned long oprofile_buffer_watershed; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static ssize_t timeout_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(jiffies_to_msecs(timeout_jiffies), + buf, count, offset); +} + + +static ssize_t timeout_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + + retval = oprofile_set_timeout(val); + + if (retval) + return retval; + return count; +} + + +static const struct file_operations timeout_fops = { + .read = timeout_read, + .write = timeout_write, +}; + +#endif + + static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { return oprofilefs_ulong_to_user(oprofile_backtrace_depth, buf, count, @@ -139,6 +179,9 @@ void oprofile_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + oprofilefs_create_file(sb, root, "time_slice", &timeout_fops); +#endif oprofile_create_stats_files(sb, root); if (oprofile_ops.create_files) oprofile_ops.create_files(sb, root); diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c index 3c2270a8300..77a57a6792f 100644 --- a/drivers/oprofile/oprofile_stats.c +++ b/drivers/oprofile/oprofile_stats.c @@ -16,6 +16,9 @@ #include "cpu_buffer.h" struct oprofile_stat_struct oprofile_stats; +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX +atomic_t multiplex_counter; +#endif void oprofile_reset_stats(void) { @@ -34,6 +37,9 @@ void oprofile_reset_stats(void) 
atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); atomic_set(&oprofile_stats.event_lost_overflow, 0); atomic_set(&oprofile_stats.bt_lost_no_mapping, 0); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + atomic_set(&multiplex_counter, 0); +#endif } @@ -76,4 +82,8 @@ void oprofile_create_stats_files(struct super_block *sb, struct dentry *root) &oprofile_stats.event_lost_overflow); oprofilefs_create_ro_atomic(sb, dir, "bt_lost_no_mapping", &oprofile_stats.bt_lost_no_mapping); +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + oprofilefs_create_ro_atomic(sb, dir, "multiplex_counter", + &multiplex_counter); +#endif } diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index d68d2ed94f1..5171639ecf0 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -67,6 +67,9 @@ struct oprofile_operations { /* Initiate a stack backtrace. Optional. */ void (*backtrace)(struct pt_regs * const regs, unsigned int depth); + + /* Multiplex between different events. Optional. */ + int (*switch_events)(void); /* CPU identification string. */ char * cpu_type; }; -- cgit v1.2.3-70-g09d2 From 5e766e3e433fa2d5d2fdfd8e2432804c91393387 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 8 Jul 2009 14:54:17 +0200 Subject: x86/oprofile: Fix usage of NUM_CONTROLS/NUM_COUNTERS macros Use the corresponding macros when iterating over counter and control registers. Since NUM_CONTROLS and NUM_COUNTERS are equal for AMD cpus the fix is more of a cosmetic change.
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index fdbed3a0c87..dcfd4505cac 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -99,7 +99,7 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = i % NUM_CONTROLS; + int hw_counter = i % NUM_COUNTERS; if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; else @@ -366,7 +366,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) if (msrs->counters[i].addr) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < NUM_CONTROLS; ++i) { if (msrs->controls[i].addr) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } -- cgit v1.2.3-70-g09d2 From 82a225283fb0d9438549595d9e6f3ecc42b42ad6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 16:29:34 +0200 Subject: x86/oprofile: Use per_cpu() instead of __get_cpu_var() __get_cpu_var() calls smp_processor_id(). When the cpu id is already known, instead use per_cpu() to avoid generating the id again. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index e54f6a0b35a..8cd4658370b 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -294,7 +294,7 @@ static void nmi_cpu_shutdown(void *dummy) { unsigned int v; int cpu = smp_processor_id(); - struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); /* restoring APIC_LVTPC can trigger an apic error because the delivery * mode and vector nr combination can be illegal. 
That's by design: on @@ -307,7 +307,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - __get_cpu_var(switch_index) = 0; + per_cpu(switch_index, cpu) = 0; #endif } -- cgit v1.2.3-70-g09d2 From 6bfccd099c2841e1c42530f1b6d2553bfa13be3a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 19:23:50 +0200 Subject: x86/oprofile: Fix initialization of switch_index Variable switch_index must be initialized for each cpu. This patch fixes the initialization by moving it to the per-cpu init function nmi_cpu_setup(). Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 8cd4658370b..b211d335e07 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -160,7 +160,7 @@ static int allocate_msrs(void) #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -static void nmi_setup_cpu_mux(struct op_msrs const * const msrs) +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { int i; struct op_msr *multiplex = msrs->multiplex; @@ -173,8 +173,15 @@ static void nmi_setup_cpu_mux(struct op_msrs const * const msrs) multiplex[i].saved = 0; } } + + per_cpu(switch_index, cpu) = 0; } +#else + +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } + #endif static void nmi_cpu_setup(void *dummy) @@ -184,9 +191,7 @@ static void nmi_cpu_setup(void *dummy) nmi_cpu_save_registers(msrs); spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - nmi_setup_cpu_mux(msrs); -#endif + nmi_cpu_setup_mux(cpu, msrs); spin_unlock(&oprofilefs_lock); per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC); apic_write(APIC_LVTPC, APIC_DM_NMI); @@ -662,9 +667,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) 
register_cpu_notifier(&oprofile_cpu_nb); #endif /* default values, can be overwritten by model */ -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - __raw_get_cpu_var(switch_index) = 0; -#endif ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; -- cgit v1.2.3-70-g09d2 From d8471ad3ab613a1ba7abd3aad46659de39a2871c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Jul 2009 13:04:43 +0200 Subject: oprofile: Introduce op_x86_phys_to_virt() This new function translates physical to virtual counter numbers. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 43 +++++++++++---------- arch/x86/oprofile/op_model_amd.c | 80 ++++++++++++++++------------------------ arch/x86/oprofile/op_x86_model.h | 1 + 3 files changed, 55 insertions(+), 69 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b211d335e07..02b57b8d0e6 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -27,12 +27,6 @@ #include "op_counter.h" #include "op_x86_model.h" - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -DEFINE_PER_CPU(int, switch_index); -#endif - - static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); @@ -103,6 +97,21 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) } } +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static DEFINE_PER_CPU(int, switch_index); + +inline int op_x86_phys_to_virt(int phys) +{ + return __get_cpu_var(switch_index) + phys; +} + +#else + +inline int op_x86_phys_to_virt(int phys) { return phys; } + +#endif + static void free_msrs(void) { int i; @@ -248,31 +257,25 @@ static int nmi_setup(void) static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) { - unsigned int si = __get_cpu_var(switch_index); struct op_msr *multiplex = msrs->multiplex; - unsigned int i; + int i; for (i = 0; i < model->num_counters; ++i) { - int offset = i + si; - if 
(multiplex[offset].addr) { - rdmsrl(multiplex[offset].addr, - multiplex[offset].saved); - } + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + rdmsrl(multiplex[virt].addr, multiplex[virt].saved); } } static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) { - unsigned int si = __get_cpu_var(switch_index); struct op_msr *multiplex = msrs->multiplex; - unsigned int i; + int i; for (i = 0; i < model->num_counters; ++i) { - int offset = i + si; - if (multiplex[offset].addr) { - wrmsrl(multiplex[offset].addr, - multiplex[offset].saved); - } + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + wrmsrl(multiplex[virt].addr, multiplex[virt].saved); } } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index dcfd4505cac..67f830d12e0 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -42,9 +42,6 @@ #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) static unsigned long reset_value[NUM_VIRT_COUNTERS]; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -DECLARE_PER_CPU(int, switch_index); -#endif #ifdef CONFIG_OPROFILE_IBS @@ -141,21 +138,20 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - int offset = i + __get_cpu_var(switch_index); -#else - int offset = i; -#endif - if (counter_config[offset].enabled && msrs->counters[i].addr) { - /* setup counter registers */ - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); - - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[offset]); - wrmsrl(msrs->controls[i].addr, val); - } + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + if (!msrs->counters[i].addr) + continue; + + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + 
/* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); } } @@ -170,14 +166,13 @@ static void op_amd_switch_ctrl(struct op_x86_model_spec const *model, /* enable active counters */ for (i = 0; i < NUM_COUNTERS; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (counter_config[offset].enabled) { - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[offset]); - wrmsrl(msrs->controls[i].addr, val); - } + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); } } @@ -292,19 +287,15 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, int i; for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - int offset = i + __get_cpu_var(switch_index); -#else - int offset = i; -#endif - if (!reset_value[offset]) + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) continue; rdmsrl(msrs->counters[i].addr, val); /* bit is clear if overflowed: */ if (val & OP_CTR_OVERFLOW) continue; - oprofile_add_sample(regs, offset); - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[offset]); + oprofile_add_sample(regs, virt); + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); } op_amd_handle_ibs(regs, msrs); @@ -319,16 +310,11 @@ static void op_amd_start(struct op_msrs const * const msrs) int i; for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - int offset = i + __get_cpu_var(switch_index); -#else - int offset = i; -#endif - if (reset_value[offset]) { - rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(msrs->controls[i].addr, val); - } + if 
(!reset_value[op_x86_phys_to_virt(i)]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(msrs->controls[i].addr, val); } op_amd_start_ibs(); @@ -344,11 +330,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) * pm callback */ for (i = 0; i < NUM_COUNTERS; ++i) { -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) -#else - if (!reset_value[i]) -#endif + if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 0d07d23cb06..e874dc3565a 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -60,6 +60,7 @@ struct op_counter_config; extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); +extern int op_x86_phys_to_virt(int phys); extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; -- cgit v1.2.3-70-g09d2 From 7e7478c6bc0e011d2854b21f190cc3a1dba89905 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Jul 2009 13:09:53 +0200 Subject: oprofile: Grouping multiplexing code in op_model_amd.c This patch moves some multiplexing code to the new function op_mux_fill_in_addresses(). Also, the whole multiplexing code is now at a single location. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 75 ++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 67f830d12e0..644980f0392 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -74,6 +74,45 @@ static struct op_ibs_config ibs_config; #endif +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_VIRT_COUNTERS; i++) { + int hw_counter = i % NUM_COUNTERS; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; + else + msrs->multiplex[i].addr = 0; + } +} + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!counter_config[virt].enabled) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} + +#else + +static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } + +#endif + /* functions for op_amd_spec */ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) @@ -94,15 +133,7 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) msrs->controls[i].addr = 0; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = i % NUM_COUNTERS; - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; - else - msrs->multiplex[i].addr = 0; - } -#endif + op_mux_fill_in_addresses(msrs); } static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, @@ -155,30 +186,6 @@ static void 
op_amd_setup_ctrs(struct op_x86_model_spec const *model, } } - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void op_amd_switch_ctrl(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -#endif - - #ifdef CONFIG_OPROFILE_IBS static inline int @@ -535,6 +542,6 @@ struct op_x86_model_spec const op_amd_spec = { .stop = &op_amd_stop, .shutdown = &op_amd_shutdown, #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - .switch_ctrl = &op_amd_switch_ctrl, + .switch_ctrl = &op_mux_switch_ctrl, #endif }; -- cgit v1.2.3-70-g09d2 From 6ab82f958a5dca591a6ea17a3ca6f2aca06f4f2f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:40:04 +0200 Subject: x86/oprofile: Implement multiplexing setup/shutdown functions This patch implements nmi_setup_mux() and nmi_shutdown_mux() functions to setup/shutdown multiplexing. Multiplexing code in nmi_int.c is now much more separated. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 76 ++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 02b57b8d0e6..674fa37d150 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -106,9 +106,35 @@ inline int op_x86_phys_to_virt(int phys) return __get_cpu_var(switch_index) + phys; } +static void nmi_shutdown_mux(void) +{ + int i; + for_each_possible_cpu(i) { + kfree(per_cpu(cpu_msrs, i).multiplex); + per_cpu(cpu_msrs, i).multiplex = NULL; + per_cpu(switch_index, i) = 0; + } +} + +static int nmi_setup_mux(void) +{ + size_t multiplex_size = + sizeof(struct op_msr) * model->num_virt_counters; + int i; + for_each_possible_cpu(i) { + per_cpu(cpu_msrs, i).multiplex = + kmalloc(multiplex_size, GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).multiplex) + return 0; + } + return 1; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } +static inline void nmi_shutdown_mux(void) { } +static inline int nmi_setup_mux(void) { return 1; } #endif @@ -120,51 +146,27 @@ static void free_msrs(void) per_cpu(cpu_msrs, i).counters = NULL; kfree(per_cpu(cpu_msrs, i).controls); per_cpu(cpu_msrs, i).controls = NULL; - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - kfree(per_cpu(cpu_msrs, i).multiplex); - per_cpu(cpu_msrs, i).multiplex = NULL; -#endif } } static int allocate_msrs(void) { - int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - size_t multiplex_size = sizeof(struct op_msr) * model->num_virt_counters; -#endif int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).counters) { - success = 0; - break; - } + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).counters) + return 0; 
per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, - GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).controls) { - success = 0; - break; - } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - per_cpu(cpu_msrs, i).multiplex = - kmalloc(multiplex_size, GFP_KERNEL); - if (!per_cpu(cpu_msrs, i).multiplex) { - success = 0; - break; - } -#endif + GFP_KERNEL); + if (!per_cpu(cpu_msrs, i).controls) + return 0; } - if (!success) - free_msrs(); - - return success; + return 1; } #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX @@ -218,11 +220,15 @@ static int nmi_setup(void) int cpu; if (!allocate_msrs()) - return -ENOMEM; + err = -ENOMEM; + else if (!nmi_setup_mux()) + err = -ENOMEM; + else + err = register_die_notifier(&profile_exceptions_nb); - err = register_die_notifier(&profile_exceptions_nb); if (err) { free_msrs(); + nmi_shutdown_mux(); return err; } @@ -314,9 +320,6 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - per_cpu(switch_index, cpu) = 0; -#endif } static void nmi_shutdown(void) @@ -326,6 +329,7 @@ static void nmi_shutdown(void) nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); + nmi_shutdown_mux(); msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); -- cgit v1.2.3-70-g09d2 From 48fb4b46712c7d3e8adc79826311abd9ccbf7f1d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:38:49 +0200 Subject: x86/oprofile: Moving nmi_setup_cpu_mux() in nmi_int.c This patch moves some code in nmi_int.c to get a single separate multiplexing code section. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 674fa37d150..b1edfc922e7 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -130,11 +130,30 @@ static int nmi_setup_mux(void) return 1; } +static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) +{ + int i; + struct op_msr *multiplex = msrs->multiplex; + + for (i = 0; i < model->num_virt_counters; ++i) { + if (counter_config[i].enabled) { + multiplex[i].saved = -(u64)counter_config[i].count; + } else { + multiplex[i].addr = 0; + multiplex[i].saved = 0; + } + } + + per_cpu(switch_index, cpu) = 0; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } static inline void nmi_shutdown_mux(void) { } static inline int nmi_setup_mux(void) { return 1; } +static inline void +nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } #endif @@ -169,32 +188,6 @@ static int allocate_msrs(void) return 1; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) -{ - int i; - struct op_msr *multiplex = msrs->multiplex; - - for (i = 0; i < model->num_virt_counters; ++i) { - if (counter_config[i].enabled) { - multiplex[i].saved = -(u64)counter_config[i].count; - } else { - multiplex[i].addr = 0; - multiplex[i].saved = 0; - } - } - - per_cpu(switch_index, cpu) = 0; -} - -#else - -static inline void -nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } - -#endif - static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); -- cgit v1.2.3-70-g09d2 From d0f585dd20010f8479e56b5c6f391ef18e26877e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:38:49 +0200 Subject: x86/oprofile: Moving nmi_cpu_save/restore_mpx_registers() in nmi_int.c This patch moves some 
code in nmi_int.c to get a single separate multiplexing code section. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 52 +++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b1edfc922e7..f38c5cf0fdb 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -147,6 +147,30 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) per_cpu(switch_index, cpu) = 0; } +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + struct op_msr *multiplex = msrs->multiplex; + int i; + + for (i = 0; i < model->num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (multiplex[virt].addr) + wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + } +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -252,34 +276,6 @@ static int nmi_setup(void) return 0; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - rdmsrl(multiplex[virt].addr, multiplex[virt].saved); - } -} - -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) -{ - struct op_msr *multiplex = msrs->multiplex; - int i; - - for (i = 0; i < model->num_counters; ++i) { - int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - wrmsrl(multiplex[virt].addr, multiplex[virt].saved); - } -} - -#endif - static void nmi_cpu_restore_registers(struct op_msrs *msrs) { 
struct op_msr *counters = msrs->counters; -- cgit v1.2.3-70-g09d2 From b28d1b923ab52d535c0719155dccf3b3d98bab9f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:38:49 +0200 Subject: x86/oprofile: Moving nmi_cpu_switch() in nmi_int.c This patch moves some code in nmi_int.c to get a single separate multiplexing code section. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 144 +++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f38c5cf0fdb..998c7dca31e 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -97,6 +97,29 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs) } } +static void nmi_cpu_start(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->start(msrs); +} + +static int nmi_start(void) +{ + on_each_cpu(nmi_cpu_start, NULL, 1); + return 0; +} + +static void nmi_cpu_stop(void *dummy) +{ + struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); + model->stop(msrs); +} + +static void nmi_stop(void) +{ + on_each_cpu(nmi_cpu_stop, NULL, 1); +} + #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX static DEFINE_PER_CPU(int, switch_index); @@ -171,6 +194,53 @@ static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) } } +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_counters; + if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, cpu) = 0; + else + per_cpu(switch_index, cpu) = si; + + model->switch_ctrl(model, msrs); + nmi_cpu_restore_mpx_registers(msrs); + + nmi_cpu_start(NULL); +} + + +/* + * Quick check to see if multiplexing is necessary. 
+ * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (!model->switch_ctrl) + return -ENOSYS; /* not implemented */ + if (nmi_multiplex_on() < 0) + return -EINVAL; /* not necessary */ + + on_each_cpu(nmi_cpu_switch, NULL, 1); + + atomic_inc(&multiplex_counter); + + return 0; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -325,29 +395,6 @@ static void nmi_shutdown(void) put_cpu_var(cpu_msrs); } -static void nmi_cpu_start(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->start(msrs); -} - -static int nmi_start(void) -{ - on_each_cpu(nmi_cpu_start, NULL, 1); - return 0; -} - -static void nmi_cpu_stop(void *dummy) -{ - struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->stop(msrs); -} - -static void nmi_stop(void) -{ - on_each_cpu(nmi_cpu_stop, NULL, 1); -} - static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; @@ -379,57 +426,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void nmi_cpu_switch(void *dummy) -{ - int cpu = smp_processor_id(); - int si = per_cpu(switch_index, cpu); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_stop(NULL); - nmi_cpu_save_mpx_registers(msrs); - - /* move to next set */ - si += model->num_counters; - if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) - per_cpu(switch_index, cpu) = 0; - else - per_cpu(switch_index, cpu) = si; - - model->switch_ctrl(model, msrs); - nmi_cpu_restore_mpx_registers(msrs); - - nmi_cpu_start(NULL); -} - - -/* - * Quick check to see if multiplexing is necessary. - * The check should be sufficient since counters are used - * in ordre. 
- */ -static int nmi_multiplex_on(void) -{ - return counter_config[model->num_counters].count ? 0 : -EINVAL; -} - -static int nmi_switch_event(void) -{ - if (!model->switch_ctrl) - return -ENOSYS; /* not implemented */ - if (nmi_multiplex_on() < 0) - return -EINVAL; /* not necessary */ - - on_each_cpu(nmi_cpu_switch, NULL, 1); - - atomic_inc(&multiplex_counter); - - return 0; -} - -#endif - #ifdef CONFIG_SMP static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) -- cgit v1.2.3-70-g09d2 From 259a83a8abdb9d2664819ec80ad12ebaeb251e32 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 15:12:35 +0200 Subject: x86/oprofile: Remove const qualifier from struct op_x86_model_spec This patch removes the const qualifier from struct op_x86_model_spec to make model parameters changeable. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 4 ++-- arch/x86/oprofile/op_model_amd.c | 2 +- arch/x86/oprofile/op_model_p4.c | 4 ++-- arch/x86/oprofile/op_model_ppro.c | 2 +- arch/x86/oprofile/op_x86_model.h | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 998c7dca31e..826f391b422 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -27,7 +27,7 @@ #include "op_counter.h" #include "op_x86_model.h" -static struct op_x86_model_spec const *model; +static struct op_x86_model_spec *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); @@ -542,7 +542,7 @@ module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; - struct op_x86_model_spec const *spec = &op_ppro_spec; /* default */ + struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; diff --git a/arch/x86/oprofile/op_model_amd.c
b/arch/x86/oprofile/op_model_amd.c index 644980f0392..39604b429d6 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -526,7 +526,7 @@ static void op_amd_exit(void) {} #endif /* CONFIG_OPROFILE_IBS */ -struct op_x86_model_spec const op_amd_spec = { +struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, .num_virt_counters = NUM_VIRT_COUNTERS, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 65b9237cde8..40df028d0d9 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -695,7 +695,7 @@ static void p4_shutdown(struct op_msrs const * const msrs) #ifdef CONFIG_SMP -struct op_x86_model_spec const op_p4_ht2_spec = { +struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, .num_virt_counters = NUM_COUNTERS_HT2, @@ -709,7 +709,7 @@ struct op_x86_model_spec const op_p4_ht2_spec = { }; #endif -struct op_x86_model_spec const op_p4_spec = { +struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, .num_virt_counters = NUM_COUNTERS_NON_HT, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 098cbca5c0b..659f3b6f86f 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -203,7 +203,7 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } -struct op_x86_model_spec const op_ppro_spec = { +struct op_x86_model_spec op_ppro_spec = { .num_counters = 2, .num_controls = 2, .num_virt_counters = 2, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index e874dc3565a..0c886fa0369 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -62,10 +62,10 @@ extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int 
op_x86_phys_to_virt(int phys); -extern struct op_x86_model_spec const op_ppro_spec; -extern struct op_x86_model_spec const op_p4_spec; -extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_amd_spec; +extern struct op_x86_model_spec op_ppro_spec; +extern struct op_x86_model_spec op_p4_spec; +extern struct op_x86_model_spec op_p4_ht2_spec; +extern struct op_x86_model_spec op_amd_spec; extern struct op_x86_model_spec op_arch_perfmon_spec; #endif /* OP_X86_MODEL_H */ -- cgit v1.2.3-70-g09d2 From 2904a527575344a804fdd82b1f8d09a8731d8d49 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 12:33:41 +0200 Subject: x86/oprofile: Remove unused num_virt_controls from struct op_x86_model_spec The member num_virt_controls of struct op_x86_model_spec is not used. This patch removes it. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 1 - arch/x86/oprofile/op_model_p4.c | 2 -- arch/x86/oprofile/op_model_ppro.c | 1 - arch/x86/oprofile/op_x86_model.h | 1 - 4 files changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 39604b429d6..dce69b5979e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -530,7 +530,6 @@ struct op_x86_model_spec op_amd_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, .num_virt_counters = NUM_VIRT_COUNTERS, - .num_virt_controls = NUM_VIRT_CONTROLS, .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 40df028d0d9..0a4f2deb9e8 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -699,7 +699,6 @@ struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, .num_virt_counters = NUM_COUNTERS_HT2, - .num_virt_controls = NUM_CONTROLS_HT2, 
.fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -713,7 +712,6 @@ struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, .num_virt_counters = NUM_COUNTERS_NON_HT, - .num_virt_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 659f3b6f86f..753a02ab215 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -207,7 +207,6 @@ struct op_x86_model_spec op_ppro_spec = { .num_counters = 2, .num_controls = 2, .num_virt_counters = 2, - .num_virt_controls = 2, .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 0c886fa0369..4e2e7c2c519 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -37,7 +37,6 @@ struct op_x86_model_spec { unsigned int num_counters; unsigned int num_controls; unsigned int num_virt_counters; - unsigned int num_virt_controls; u64 reserved; u16 event_mask; int (*init)(struct oprofile_operations *ops); -- cgit v1.2.3-70-g09d2 From 52471c67ee2fa5ed6f700ef57bf27833c63b2192 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 6 Jul 2009 14:43:55 +0200 Subject: x86/oprofile: Modify initialization of num_virt_counters Models that do not yet support counter multiplexing have to set up num_virt_counters. This patch implements the setup from num_counters if num_virt_counters is not set. Thus, num_virt_counters must be set up only for multiplexing support.
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 3 +++ arch/x86/oprofile/op_model_p4.c | 2 -- arch/x86/oprofile/op_model_ppro.c | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 826f391b422..82ee29517f1 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -674,6 +674,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (ret) return ret; + if (!model->num_virt_counters) + model->num_virt_counters = model->num_counters; + init_sysfs(); using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 0a4f2deb9e8..ac6b354becd 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -698,7 +698,6 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, - .num_virt_counters = NUM_COUNTERS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -711,7 +710,6 @@ struct op_x86_model_spec op_p4_ht2_spec = { struct op_x86_model_spec op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, - .num_virt_counters = NUM_COUNTERS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 753a02ab215..4899215999d 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -206,7 +206,6 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec op_ppro_spec = { .num_counters = 2, .num_controls = 2, - .num_virt_counters = 2, .reserved = MSR_PPRO_EVENTSEL_RESERVED, .fill_in_addresses = &ppro_fill_in_addresses, 
.setup_ctrs = &ppro_setup_ctrs, -- cgit v1.2.3-70-g09d2 From 39e97f40c3a5e71de0532368deaa683e09b74ba2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 15:11:45 +0200 Subject: x86/oprofile: Add function has_mux() to check multiplexing support The check is used to prevent running multiplexing code for models not supporting multiplexing. Before, the code was running but without effect. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 82ee29517f1..dca7240aeb2 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -124,6 +124,11 @@ static void nmi_stop(void) static DEFINE_PER_CPU(int, switch_index); +static inline int has_mux(void) +{ + return !!model->switch_ctrl; +} + inline int op_x86_phys_to_virt(int phys) { return __get_cpu_var(switch_index) + phys; @@ -132,6 +137,10 @@ inline int op_x86_phys_to_virt(int phys) static void nmi_shutdown_mux(void) { int i; + + if (!has_mux()) + return; + for_each_possible_cpu(i) { kfree(per_cpu(cpu_msrs, i).multiplex); per_cpu(cpu_msrs, i).multiplex = NULL; @@ -144,12 +153,17 @@ static int nmi_setup_mux(void) size_t multiplex_size = sizeof(struct op_msr) * model->num_virt_counters; int i; + + if (!has_mux()) + return 1; + for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).multiplex = kmalloc(multiplex_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).multiplex) return 0; } + return 1; } @@ -158,6 +172,9 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) int i; struct op_msr *multiplex = msrs->multiplex; + if (!has_mux()) + return; + for (i = 0; i < model->num_virt_counters; ++i) { if (counter_config[i].enabled) { multiplex[i].saved = -(u64)counter_config[i].count; @@ -229,7 +246,7 @@ static int nmi_multiplex_on(void) static int nmi_switch_event(void) { - if (!model->switch_ctrl) + if 
(!has_mux()) return -ENOSYS; /* not implemented */ if (nmi_multiplex_on() < 0) return -EINVAL; /* not necessary */ -- cgit v1.2.3-70-g09d2 From 5280514471c2803776701c43c027038decac1103 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 16:02:44 +0200 Subject: x86/oprofile: Enable multiplexing only if the model supports it This patch checks if the model supports multiplexing. Only then multiplexing will be enabled. The code is added to the common x86 initialization. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index dca7240aeb2..f0fb44725d8 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -258,6 +258,12 @@ static int nmi_switch_event(void) return 0; } +static inline void mux_init(struct oprofile_operations *ops) +{ + if (has_mux()) + ops->switch_events = nmi_switch_event; +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -265,6 +271,7 @@ static inline void nmi_shutdown_mux(void) { } static inline int nmi_setup_mux(void) { return 1; } static inline void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } +static inline void mux_init(struct oprofile_operations *ops) { } #endif @@ -682,9 +689,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - ops->switch_events = nmi_switch_event; -#endif if (model->init) ret = model->init(ops); @@ -694,6 +698,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (!model->num_virt_counters) model->num_virt_counters = model->num_counters; + mux_init(ops); + init_sysfs(); using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); -- cgit v1.2.3-70-g09d2 From 4d015f79e972cea1761cfee8872b1c0992ccd8b2 Mon Sep 17 00:00:00 2001 From: Robert 
Richter Date: Thu, 9 Jul 2009 21:42:51 +0200 Subject: x86/oprofile: Implement mux_clone() To setup a counter for all cpus, its structure is cloned from cpu 0. This patch implements mux_clone() to do this part for multiplexing data. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f0fb44725d8..da6d2ab31c6 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -264,6 +264,16 @@ static inline void mux_init(struct oprofile_operations *ops) ops->switch_events = nmi_switch_event; } +static void mux_clone(int cpu) +{ + if (!has_mux()) + return; + + memcpy(per_cpu(cpu_msrs, cpu).multiplex, + per_cpu(cpu_msrs, 0).multiplex, + sizeof(struct op_msr) * model->num_virt_counters); +} + #else inline int op_x86_phys_to_virt(int phys) { return phys; } @@ -272,6 +282,7 @@ static inline int nmi_setup_mux(void) { return 1; } static inline void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs) { } static inline void mux_init(struct oprofile_operations *ops) { } +static void mux_clone(int cpu) { } #endif @@ -350,20 +361,18 @@ static int nmi_setup(void) /* Assume saved/restored counters are the same on all CPUs */ model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); for_each_possible_cpu(cpu) { - if (cpu != 0) { - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - memcpy(per_cpu(cpu_msrs, cpu).multiplex, - per_cpu(cpu_msrs, 0).multiplex, - sizeof(struct op_msr) * model->num_virt_counters); -#endif - } + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct 
op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs, 0).controls, + sizeof(struct op_msr) * model->num_controls); + + mux_clone(cpu); } on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; -- cgit v1.2.3-70-g09d2 From 1b294f5960cd89e49eeb3e797860c552b03f2272 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 9 Jul 2009 14:56:25 +0200 Subject: oprofile: Adding switch counter to oprofile statistic variables This patch moves the multiplexing switch counter from x86 code to common oprofile statistic variables. Now the value will be available and usable for all architectures. The initialization and incrementation were also moved to common code. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 7 ------- drivers/oprofile/oprof.c | 7 +++++-- drivers/oprofile/oprofile_stats.c | 9 ++------- drivers/oprofile/oprofile_stats.h | 1 + 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index da6d2ab31c6..7b3362f9abd 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -34,11 +34,6 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -extern atomic_t multiplex_counter; -#endif - struct op_counter_config counter_config[OP_MAX_COUNTER]; /* common functions */ @@ -253,8 +248,6 @@ static int nmi_switch_event(void) on_each_cpu(nmi_cpu_switch, NULL, 1); - atomic_inc(&multiplex_counter); - return 0; } diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index a48294a8ebe..dc8a0428260 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -107,8 +107,11 @@ static void stop_switch_worker(void) static void switch_worker(struct work_struct *work) { - if (!oprofile_ops.switch_events()) - start_switch_worker(); + if (oprofile_ops.switch_events()) + return; + +
atomic_inc(&oprofile_stats.multiplex_counter); + start_switch_worker(); } /* User inputs in ms, converts to jiffies */ diff --git a/drivers/oprofile/oprofile_stats.c b/drivers/oprofile/oprofile_stats.c index 77a57a6792f..61689e814d4 100644 --- a/drivers/oprofile/oprofile_stats.c +++ b/drivers/oprofile/oprofile_stats.c @@ -16,9 +16,6 @@ #include "cpu_buffer.h" struct oprofile_stat_struct oprofile_stats; -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -atomic_t multiplex_counter; -#endif void oprofile_reset_stats(void) { @@ -37,9 +34,7 @@ void oprofile_reset_stats(void) atomic_set(&oprofile_stats.sample_lost_no_mapping, 0); atomic_set(&oprofile_stats.event_lost_overflow, 0); atomic_set(&oprofile_stats.bt_lost_no_mapping, 0); -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - atomic_set(&multiplex_counter, 0); -#endif + atomic_set(&oprofile_stats.multiplex_counter, 0); } @@ -84,6 +79,6 @@ void oprofile_create_stats_files(struct super_block *sb, struct dentry *root) &oprofile_stats.bt_lost_no_mapping); #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX oprofilefs_create_ro_atomic(sb, dir, "multiplex_counter", - &multiplex_counter); + &oprofile_stats.multiplex_counter); #endif } diff --git a/drivers/oprofile/oprofile_stats.h b/drivers/oprofile/oprofile_stats.h index 3da0d08dc1f..0b54e46c3c1 100644 --- a/drivers/oprofile/oprofile_stats.h +++ b/drivers/oprofile/oprofile_stats.h @@ -17,6 +17,7 @@ struct oprofile_stat_struct { atomic_t sample_lost_no_mapping; atomic_t bt_lost_no_mapping; atomic_t event_lost_overflow; + atomic_t multiplex_counter; }; extern struct oprofile_stat_struct oprofile_stats; -- cgit v1.2.3-70-g09d2 From 61d149d5248ad7428801cdede0f5fcc2b90cd61c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 10 Jul 2009 15:47:17 +0200 Subject: x86/oprofile: Implement op_x86_virt_to_phys() This patch implements a common x86 function to convert virtual counter numbers to physical. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 6 ++++++ arch/x86/oprofile/op_model_amd.c | 2 +- arch/x86/oprofile/op_x86_model.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7b3362f9abd..5856e61cb09 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -129,6 +129,11 @@ inline int op_x86_phys_to_virt(int phys) return __get_cpu_var(switch_index) + phys; } +inline int op_x86_virt_to_phys(int virt) +{ + return virt % model->num_counters; +} + static void nmi_shutdown_mux(void) { int i; @@ -270,6 +275,7 @@ static void mux_clone(int cpu) #else inline int op_x86_phys_to_virt(int phys) { return phys; } +inline int op_x86_virt_to_phys(int virt) { return virt; } static inline void nmi_shutdown_mux(void) { } static inline int nmi_setup_mux(void) { return 1; } static inline void diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index dce69b5979e..1ea19829d98 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -81,7 +81,7 @@ static void op_mux_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = i % NUM_COUNTERS; + int hw_counter = op_x86_virt_to_phys(i); if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; else diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 4e2e7c2c519..b83776180c7 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -60,6 +60,7 @@ struct op_counter_config; extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int op_x86_phys_to_virt(int phys); +extern int op_x86_virt_to_phys(int virt); extern struct op_x86_model_spec op_ppro_spec; extern struct op_x86_model_spec op_p4_spec; -- cgit v1.2.3-70-g09d2 
From 11be1a7b54283021777f409aa983ce125945e67c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 10 Jul 2009 18:15:21 +0200 Subject: x86/oprofile: Add counter reservation check for virtual counters This patch adds a check for the availability of a counter. A virtual counter is used only if its physical counter is not reserved. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 5856e61cb09..cb88b1a0bd5 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -435,15 +435,13 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) struct dentry *dir; char buf[4]; -#ifndef CONFIG_OPROFILE_EVENT_MULTIPLEX /* quick little hack to _not_ expose a counter if it is not * available for use. This should protect userspace app. * NOTE: assumes 1:1 mapping here (that counters are organized * sequentially in their struct assignment). */ - if (unlikely(!avail_to_resrv_perfctr_nmi_bit(i))) + if (!avail_to_resrv_perfctr_nmi_bit(op_x86_virt_to_phys(i))) continue; -#endif /* CONFIG_OPROFILE_EVENT_MULTIPLEX */ snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); -- cgit v1.2.3-70-g09d2 From c550091edd6fac2ed9dac1b30d986b6c58b216fa Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Jul 2009 13:11:16 +0200 Subject: x86/oprofile: Small coding style fixes Some small coding style fixes. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 1ea19829d98..827beecb67a 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -144,11 +144,10 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, /* setup reset_value */ for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { - if (counter_config[i].enabled) { + if (counter_config[i].enabled) reset_value[i] = counter_config[i].count; - } else { + else reset_value[i] = 0; - } } /* clear all counters */ -- cgit v1.2.3-70-g09d2 From 3162534069597e34dd0ac9eb711be8dc23835ae7 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:30:59 -0700 Subject: x86, intel_txt: Intel TXT boot support This patch adds kernel configuration and boot support for Intel Trusted Execution Technology (Intel TXT). Intel's technology for safer computing, Intel Trusted Execution Technology (Intel TXT), defines platform-level enhancements that provide the building blocks for creating trusted platforms. Intel TXT was formerly known by the code name LaGrande Technology (LT). Intel TXT in Brief: o Provides dynamic root of trust for measurement (DRTM) o Data protection in case of improper shutdown o Measurement and verification of launched environment Intel TXT is part of the vPro(TM) brand and is also available some non-vPro systems. It is currently available on desktop systems based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, PM45, and GS45 Express chipsets. For more information, see http://www.intel.com/technology/security/. This site also has a link to the Intel TXT MLE Developers Manual, which has been updated for the new released platforms. 
A much more complete description of how these patches support TXT, how to configure a system for it, etc. is in the Documentation/intel_txt.txt file in this patch. This patch provides the TXT support routines for complete functionality, documentation for TXT support and for the changes to the boot_params structure, and boot detection of a TXT launch. Attempts to shutdown (reboot, Sx) the system will result in platform resets; subsequent patches will support these shutdown modes properly. Documentation/intel_txt.txt | 210 +++++++++++++++++++++ Documentation/x86/zero-page.txt | 1 arch/x86/include/asm/bootparam.h | 3 arch/x86/include/asm/fixmap.h | 3 arch/x86/include/asm/tboot.h | 197 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 arch/x86/kernel/setup.c | 4 arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++++++++++ security/Kconfig | 30 +++ 9 files changed, 827 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: Gang Wei Signed-off-by: H. 
Peter Anvin --- Documentation/intel_txt.txt | 210 ++++++++++++++++++++++ Documentation/x86/zero-page.txt | 1 + arch/x86/include/asm/bootparam.h | 3 +- arch/x86/include/asm/fixmap.h | 3 + arch/x86/include/asm/tboot.h | 197 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/setup.c | 4 + arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++++++++++ security/Kconfig | 30 ++++ 9 files changed, 827 insertions(+), 1 deletion(-) create mode 100644 Documentation/intel_txt.txt create mode 100644 arch/x86/include/asm/tboot.h create mode 100644 arch/x86/kernel/tboot.c (limited to 'arch/x86') diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt new file mode 100644 index 00000000000..f40a1f03001 --- /dev/null +++ b/Documentation/intel_txt.txt @@ -0,0 +1,210 @@ +Intel(R) TXT Overview: +===================== + +Intel's technology for safer computing, Intel(R) Trusted Execution +Technology (Intel(R) TXT), defines platform-level enhancements that +provide the building blocks for creating trusted platforms. + +Intel TXT was formerly known by the code name LaGrande Technology (LT). + +Intel TXT in Brief: +o Provides dynamic root of trust for measurement (DRTM) +o Data protection in case of improper shutdown +o Measurement and verification of launched environment + +Intel TXT is part of the vPro(TM) brand and is also available some +non-vPro systems. It is currently available on desktop systems +based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell +Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, +PM45, and GS45 Express chipsets. + +For more information, see http://www.intel.com/technology/security/. +This site also has a link to the Intel TXT MLE Developers Manual, +which has been updated for the new released platforms. 
+ +Intel TXT has been presented at various events over the past few +years, some of which are: + LinuxTAG 2008: + http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag/ + details.html?talkid=110 + TRUST2008: + http://www.trust2008.eu/downloads/Keynote-Speakers/ + 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf + IDF 2008, Shanghai: + http://inteldeveloperforum.com.edgesuite.net/shanghai_2008/ + aep/PROS003/index.html + IDFs 2006, 2007 (I'm not sure if/where they are online) + +Trusted Boot Project Overview: +============================= + +Trusted Boot (tboot) is an open source, pre- kernel/VMM module that +uses Intel TXT to perform a measured and verified launch of an OS +kernel/VMM. + +It is hosted on SourceForge at http://sourceforge.net/projects/tboot. +The mercurial source repo is available at http://www.bughost.org/ +repos.hg/tboot.hg. + +Tboot currently supports launching Xen (open source VMM/hypervisor +w/ TXT support since v3.2), and now Linux kernels. + + +Value Proposition for Linux or "Why should you care?" +===================================================== + +While there are many products and technologies that attempt to +measure or protect the integrity of a running kernel, they all +assume the kernel is "good" to begin with. The Integrity +Measurement Architecture (IMA) and Linux Integrity Module interface +are examples of such solutions. + +To get trust in the initial kernel without using Intel TXT, a +static root of trust must be used. This bases trust in BIOS +starting at system reset and requires measurement of all code +executed between system reset through the completion of the kernel +boot as well as data objects used by that code. In the case of a +Linux kernel, this means all of BIOS, any option ROMs, the +bootloader and the boot config. In practice, this is a lot of +code/data, much of which is subject to change from boot to boot +(e.g. changing NICs may change option ROMs). 
Without reference +hashes, these measurement changes are difficult to assess or +confirm as benign. This process also does not provide DMA +protection, memory configuration/alias checks and locks, crash +protection, or policy support. + +By using the hardware-based root of trust that Intel TXT provides, +many of these issues can be mitigated. Specifically: many +pre-launch components can be removed from the trust chain, DMA +protection is provided to all launched components, a large number +of platform configuration checks are performed and values locked, +protection is provided for any data in the event of an improper +shutdown, and there is support for policy-based execution/verification. +This provides a more stable measurement and a higher assurance of +system configuration and initial state than would be otherwise +possible. Since the tboot project is open source, source code for +almost all parts of the trust chain is available (excepting SMM and +Intel-provided firmware). + +How Does it Work? +================= + +o Tboot is an executable that is launched by the bootloader as + the "kernel" (the binary the bootloader executes). +o It performs all of the work necessary to determine if the + platform supports Intel TXT and, if so, executes the GETSEC[SENTER] + processor instruction that initiates the dynamic root of trust. + - If tboot determines that the system does not support Intel TXT + or is not configured correctly (e.g. the SINIT AC Module was + incorrect), it will directly launch the kernel with no changes + to any state. + - Tboot will output various information about its progress to the + terminal, serial port, and/or an in-memory log; the output + locations can be configured with a command line switch. +o The GETSEC[SENTER] instruction will return control to tboot and + tboot then verifies certain aspects of the environment (e.g. TPM NV + lock, e820 table does not have invalid entries, etc.). 
+o It will wake the APs from the special sleep state the GETSEC[SENTER] + instruction had put them in and place them into a wait-for-SIPI + state. + - Because the processors will not respond to an INIT or SIPI when + in the TXT environment, it is necessary to create a small VT-x + guest for the APs. When they run in this guest, they will + simply wait for the INIT-SIPI-SIPI sequence, which will cause + VMEXITs, and then disable VT and jump to the SIPI vector. This + approach seemed like a better choice than having to insert + special code into the kernel's MP wakeup sequence. +o Tboot then applies an (optional) user-defined launch policy to + verify the kernel and initrd. + - This policy is rooted in TPM NV and is described in the tboot + project. The tboot project also contains code for tools to + create and provision the policy. + - Policies are completely under user control and if not present + then any kernel will be launched. + - Policy action is flexible and can include halting on failures + or simply logging them and continuing. +o Tboot adjusts the e820 table provided by the bootloader to reserve + its own location in memory as well as to reserve certain other + TXT-related regions. +o As part of its launch, tboot DMA protects all of RAM (using the + VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' + in order to remove this blanket protection and use VT-d's + page-level protection. +o Tboot will populate a shared page with some data about itself and + pass this to the Linux kernel as it transfers control. + - The location of the shared page is passed via the boot_params + struct as a physical address. +o The kernel will look for the tboot shared page address and, if it + exists, map it. +o As one of the checks/protections provided by TXT, it makes a copy + of the VT-d DMARs in a DMA-protected region of memory and verifies + them for correctness.
The VT-d code will detect if the kernel was + launched with tboot and use this copy instead of the one in the + ACPI table. +o At this point, tboot and TXT are out of the picture until a + shutdown (Sx) +o In order to put a system into any of the sleep states after a TXT + launch, TXT must first be exited. This is to prevent attacks that + attempt to crash the system to gain control on reboot and steal + data left in memory. + - The kernel will perform all of its sleep preparation and + populate the shared page with the ACPI data needed to put the + platform in the desired sleep state. + - Then the kernel jumps into tboot via the vector specified in the + shared page. + - Tboot will clean up the environment and disable TXT, then use the + kernel-provided ACPI information to actually place the platform + into the desired sleep state. + - In the case of S3, tboot will also register itself as the resume + vector. This is necessary because it must re-establish the + measured environment upon resume. Once the TXT environment + has been restored, it will restore the TPM PCRs and then + transfer control back to the kernel's S3 resume vector. + In order to preserve system integrity across S3, the kernel + provides tboot with a set of memory ranges (kernel + code/data/bss, S3 resume code, and AP trampoline) that tboot + will calculate a MAC (message authentication code) over and then + seal with the TPM. On resume and once the measured environment + has been re-established, tboot will re-calculate the MAC and + verify it against the sealed value. Tboot's policy determines + what happens if the verification fails. + +That's pretty much it for TXT support. + + +Configuring the System: +====================== + +This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels. + +In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes +allow these to be individually enabled/disabled and the screens in +which to find them are BIOS-specific.
+ +grub.conf needs to be modified as follows: + title Linux 2.6.29-tip w/ tboot + root (hd0,0) + kernel /tboot.gz logging=serial,vga,memory + module /vmlinuz-2.6.29-tip intel_iommu=on ro + root=LABEL=/ rhgb console=ttyS0,115200 3 + module /initrd-2.6.29-tip.img + module /Q35_SINIT_17.BIN + +The kernel option for enabling Intel TXT support is found under the +Security top-level menu and is called "Enable Intel(R) Trusted +Execution Technology (TXT)". It is marked as EXPERIMENTAL and +depends on the generic x86 support (to allow maximum flexibility in +kernel build options), since the tboot code will detect whether the +platform actually supports Intel TXT and thus whether any of the +kernel code is executed. + +The Q35_SINIT_17.BIN file is what Intel TXT refers to as an +Authenticated Code Module. It is specific to the chipset in the +system and can also be found on the Trusted Boot site. It is an +(unencrypted) module signed by Intel that is used as part of the +DRTM process to verify and configure the system. It is signed +because it operates at a higher privilege level in the system than +any other macrocode and its correct operation is critical to the +establishment of the DRTM. The process for determining the correct +SINIT ACM for a system is documented in the SINIT-guide.txt file +that is on the tboot SourceForge site under the SINIT ACM downloads. diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index 4f913857b8a..feb37e17701 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -12,6 +12,7 @@ Offset Proto Name Meaning 000/040 ALL screen_info Text mode or frame buffer information (struct screen_info) 040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info) +058/008 ALL tboot_addr Physical address of tboot shared page 060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information (struct ist_info) 080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! 
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..6ca20218dd7 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -85,7 +85,8 @@ struct efi_info { struct boot_params { struct screen_info screen_info; /* 0x000 */ struct apm_bios_info apm_bios_info; /* 0x040 */ - __u8 _pad2[12]; /* 0x054 */ + __u8 _pad2[4]; /* 0x054 */ + __u64 tboot_addr; /* 0x058 */ struct ist_info ist_info; /* 0x060 */ __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7b2d71df39a..14f9890eb49 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -131,6 +131,9 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_32 FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, #endif __end_of_fixed_addresses }; diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h new file mode 100644 index 00000000000..b13929d4e5f --- /dev/null +++ b/arch/x86/include/asm/tboot.h @@ -0,0 +1,197 @@ +/* + * tboot.h: shared data structure with tboot and kernel and functions + * used by kernel for runtime support of Intel(R) Trusted + * Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#ifndef _ASM_TBOOT_H +#define _ASM_TBOOT_H + +#include + +/* these must have the values from 0-5 in this order */ +enum { + TB_SHUTDOWN_REBOOT = 0, + TB_SHUTDOWN_S5, + TB_SHUTDOWN_S4, + TB_SHUTDOWN_S3, + TB_SHUTDOWN_HALT, + TB_SHUTDOWN_WFS +}; + +#ifdef CONFIG_INTEL_TXT + +/* used to communicate between tboot and the launched kernel */ + +#define TB_KEY_SIZE 64 /* 512 bits */ + +#define MAX_TB_MAC_REGIONS 32 + +struct tboot_mac_region { + u64 start; /* must be 64 byte -aligned */ + u32 size; /* must be 64 byte -granular */ +} __packed; + +/* GAS - Generic Address Structure (ACPI 2.0+) */ +struct tboot_acpi_generic_address { + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_width; + u64 address; +} __packed; + +/* + * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec + * (http://www.acpi.info/) + */ +struct tboot_acpi_sleep_info { + struct tboot_acpi_generic_address pm1a_cnt_blk; + struct tboot_acpi_generic_address pm1b_cnt_blk; + struct tboot_acpi_generic_address pm1a_evt_blk; + struct tboot_acpi_generic_address pm1b_evt_blk; + u16 pm1a_cnt_val; + u16 pm1b_cnt_val; + u64 wakeup_vector; + u32 vector_width; + u64 kernel_s3_resume_vector; +} __packed; + +/* + * shared memory page used for communication between tboot and kernel + */ +struct tboot { + /* + * version 3+ fields: + */ + + /* TBOOT_UUID */ + u8 uuid[16]; + + /* version number: 5 is current */ + u32 version; + + /* physical addr of tb_log_t log */ + u32 log_addr; + + /* + * physical addr of entry point for tboot shutdown and + * type of shutdown (TB_SHUTDOWN_*) being requested + */ + u32 shutdown_entry; + u32 shutdown_type; + + /* kernel-specified ACPI info for Sx shutdown */ + struct tboot_acpi_sleep_info acpi_sinfo; + + /* tboot location in memory (physical) */ + u32 tboot_base; + u32 tboot_size; + + /* memory regions (phys addrs) for tboot to MAC on S3 */ + u8 num_mac_regions; + struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; + + + /* + * version 4+ fields: + */ 
+ + /* symmetric key for use by kernel; will be encrypted on S3 */ + u8 s3_key[TB_KEY_SIZE]; + + + /* + * version 5+ fields: + */ + + /* used to 4byte-align num_in_wfs */ + u8 reserved_align[3]; + + /* number of processors in wait-for-SIPI */ + u32 num_in_wfs; +} __packed; + +/* + * UUID for tboot data struct to facilitate matching + * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is + * represented as {} in the char array used here + */ +#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ + 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} + +extern struct tboot *tboot; + +static inline int tboot_enabled(void) +{ + return tboot != NULL; +} + +extern void tboot_probe(void); +extern void tboot_create_trampoline(void); +extern void tboot_shutdown(u32 shutdown_type); +extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); +extern int tboot_wait_for_aps(int num_aps); +extern struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl); +extern int tboot_force_iommu(void); + +#else /* CONFIG_INTEL_TXT */ + +static inline int tboot_enabled(void) +{ + return 0; +} + +static inline void tboot_probe(void) +{ +} + +static inline void tboot_create_trampoline(void) +{ +} + +static inline void tboot_shutdown(u32 shutdown_type) +{ +} + +static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, + u32 pm1b_control) +{ +} + +static inline int tboot_wait_for_aps(int num_aps) +{ + return 0; +} + +static inline struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl) +{ + return dmar_tbl; +} + +static inline int tboot_force_iommu(void) +{ + return 0; +} + +#endif /* !CONFIG_INTEL_TXT */ + +#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..832cb838cb4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o 
obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de2cab13284..80d6e9e3248 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -145,6 +145,8 @@ struct boot_params __initdata boot_params; struct boot_params boot_params; #endif +#include + /* * Machine setup.. */ @@ -964,6 +966,8 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); + tboot_probe(); + #ifdef CONFIG_X86_64 map_vsyscall(); #endif diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..263591afd29 --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,379 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. 
*/ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + 
.cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + if (!tboot_enabled()) + return; + + /* Create identity map for tboot shutdown code. 
*/ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); +} + +static void set_mac_regions(void) +{ + tboot->num_mac_regions = 3; + /* S3 resume code */ + tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); + tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; + /* AP trampoline code */ + tboot->mac_regions[1].start = + PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); + tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; + /* kernel code + data + bss */ + tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - + PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); +} + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + set_mac_regions(); + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); 
+ TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + if (!tboot_enabled()) + return 0; + + timeout = jiffies + AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + time_before(jiffies, timeout)) + cpu_relax(); + + return time_before(jiffies, timeout) ? 
0 : 1; +} + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + 
((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/security/Kconfig b/security/Kconfig index d23c839038f..edc7cbdc012 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,36 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. +config INTEL_TXT + bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)" + depends on EXPERIMENTAL && X86 && DMAR && ACPI + help + This option enables support for booting the kernel with the + Trusted Boot (tboot) module. This will utilize + Intel(R) Trusted Execution Technology to perform a measured launch + of the kernel. If the system does not support Intel(R) TXT, this + will have no effect. + + Intel TXT will provide higher assurance of system configuration and + initial state as well as data reset protection. This is used to + create a robust initial kernel measurement and verification, which + helps to ensure that kernel security mechanisms are functioning + correctly. This level of protection requires a root of trust outside + of the kernel itself. + + Intel TXT also helps solve real end user concerns about having + confidence that their hardware is running the VMM or kernel that + it was configured with, especially since they may be responsible for + providing such assurances to VMs and services running on it. + + See <http://www.intel.com/technology/security/> for more information + about Intel(R) TXT. + See <http://tboot.sourceforge.net> for more information about tboot. + See Documentation/intel_txt.txt for a description of how to enable + Intel TXT support in a kernel boot. + + If you are unsure as to whether this is required, answer N.
+ source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig -- cgit v1.2.3-70-g09d2 From 840c2baf2d4cdf35ecc3b7fcbba7740f97de30a4 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:02 -0700 Subject: x86, intel_txt: Intel TXT reboot/halt shutdown support Support for graceful handling of kernel reboots after an Intel(R) TXT launch. Without this patch, attempting to reboot or halt the system will cause the TXT hardware to lock memory upon system restart because the secrets-in-memory flag that was set on launch was never cleared. This will in turn cause BIOS to execute a TXT Authenticated Code Module (ACM) that will scrub all of memory and then unlock it. Depending on the amount of memory in the system and its type, this may take some time. This patch creates a 1:1 address mapping to the tboot module and then calls back into tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag. When it has completed these steps, the tboot module will reboot or halt the system. arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d1ce8170f..9de01c5d979 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,6 +24,8 @@ # include #endif +#include + /* * Power off function, if any */ @@ -460,6 +462,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -586,6 +590,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -597,6 +603,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/init/main.c b/init/main.c index 2c5ade79eb8..56ada27c4f4 100644 --- a/init/main.c +++ b/init/main.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -715,6 +716,8 @@ asmlinkage void __init start_kernel(void) ftrace_init(); + tboot_create_trampoline(); + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } -- cgit v1.2.3-70-g09d2 From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:07 -0700 Subject: x86, intel_txt: Intel TXT Sx shutdown support Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT launch. Without this patch, attempting to place the system in one of the ACPI sleep states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and will cause a system reset, with memory locked. 
Not only may the subsequent memory scrub take some time, but the platform will be unable to enter the requested power state. This patch calls back into the tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag, after which it will place the system into the requested sleep state using ACPI information passed by the kernel. arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..61cc40887c4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1317,6 +1318,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index db307a356f0..8c01dd3724e 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -45,6 +45,7 @@ #include #include "accommon.h" #include "actables.h" +#include #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -342,6 +343,8 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) ACPI_FLUSH_CPU_CACHE(); + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + /* Write #2: Write both SLP_TYP + SLP_EN */ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..ff071e022a8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to 
cpu_online_mask, cpu_present_mask */ @@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error, num_cpus = 0; error = stop_machine_create(); if (error) @@ -391,6 +392,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -401,6 +403,9 @@ int disable_nonboot_cpus(void) break; } } + /* ensure all CPUs have gone into wait-for-SIPI */ + error |= tboot_wait_for_aps(num_cpus); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ -- cgit v1.2.3-70-g09d2 From d7aacaddcac3971e33cf52d7e610c06696cb347f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 8 Jul 2009 13:21:31 +0200 Subject: Driver Core: Add platform device arch data V3 Allow architecture specific data in struct platform_device V3. With this patch struct pdev_archdata is added to struct platform_device, similar to struct dev_archdata found in struct device. Useful for architecture code that needs to keep extra data associated with each platform device. Struct pdev_archdata is different from dev.platform_data, the convention is that dev.platform_data points to driver-specific data. It may or may not be required by the driver. The format of this depends on driver but is the same across architectures. The structure pdev_archdata is a place for architecture specific data. This data is handled by architecture specific code (for example runtime PM), and since it is architecture specific it should _never_ be touched by device driver code. Exactly like struct dev_archdata but for platform devices. [rjw: This change is for power management mostly and that's why it goes through the suspend tree.] Signed-off-by: Magnus Damm Acked-by: Kevin Hilman Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J.
Wysocki --- arch/arm/include/asm/device.h | 3 +++ arch/ia64/include/asm/device.h | 3 +++ arch/microblaze/include/asm/device.h | 3 +++ arch/powerpc/include/asm/device.h | 3 +++ arch/sparc/include/asm/device.h | 3 +++ arch/x86/include/asm/device.h | 3 +++ include/asm-generic/device.h | 3 +++ include/linux/platform_device.h | 3 +++ 8 files changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/device.h b/arch/arm/include/asm/device.h index c61642b4060..9f390ce335c 100644 --- a/arch/arm/include/asm/device.h +++ b/arch/arm/include/asm/device.h @@ -12,4 +12,7 @@ struct dev_archdata { #endif }; +struct pdev_archdata { +}; + #endif diff --git a/arch/ia64/include/asm/device.h b/arch/ia64/include/asm/device.h index 41ab85d66f3..d66d446b127 100644 --- a/arch/ia64/include/asm/device.h +++ b/arch/ia64/include/asm/device.h @@ -15,4 +15,7 @@ struct dev_archdata { #endif }; +struct pdev_archdata { +}; + #endif /* _ASM_IA64_DEVICE_H */ diff --git a/arch/microblaze/include/asm/device.h b/arch/microblaze/include/asm/device.h index c042830793e..30286db27c1 100644 --- a/arch/microblaze/include/asm/device.h +++ b/arch/microblaze/include/asm/device.h @@ -16,6 +16,9 @@ struct dev_archdata { struct device_node *of_node; }; +struct pdev_archdata { +}; + #endif /* _ASM_MICROBLAZE_DEVICE_H */ diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 7d2277cef09..e3e06e0f7fc 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -30,4 +30,7 @@ dev_archdata_get_node(const struct dev_archdata *ad) return ad->of_node; } +struct pdev_archdata { +}; + #endif /* _ASM_POWERPC_DEVICE_H */ diff --git a/arch/sparc/include/asm/device.h b/arch/sparc/include/asm/device.h index 3702e087df2..f3b85b6b0b7 100644 --- a/arch/sparc/include/asm/device.h +++ b/arch/sparc/include/asm/device.h @@ -32,4 +32,7 @@ dev_archdata_get_node(const struct dev_archdata *ad) return ad->prom_node; } +struct pdev_archdata { +}; + #endif 
/* _ASM_SPARC_DEVICE_H */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 4994a20acbc..cee34e9ca45 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -13,4 +13,7 @@ struct dma_map_ops *dma_ops; #endif }; +struct pdev_archdata { +}; + #endif /* _ASM_X86_DEVICE_H */ diff --git a/include/asm-generic/device.h b/include/asm-generic/device.h index c17c9600f22..d7c76bba640 100644 --- a/include/asm-generic/device.h +++ b/include/asm-generic/device.h @@ -9,4 +9,7 @@ struct dev_archdata { }; +struct pdev_archdata { +}; + #endif /* _ASM_GENERIC_DEVICE_H */ diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 8dc5123b630..672a6984973 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -22,6 +22,9 @@ struct platform_device { struct resource * resource; struct platform_device_id *id_entry; + + /* arch specific additions */ + struct pdev_archdata archdata; }; #define platform_get_device_id(pdev) ((pdev)->id_entry) -- cgit v1.2.3-70-g09d2 From 3885123da8335dc6b67387e5e626acbffc56f664 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:50 +0900 Subject: swiotlb: remove unused swiotlb_alloc_boot() Nobody uses swiotlb_alloc_boot(). 
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/x86/kernel/pci-swiotlb.c | 5 ----- include/linux/swiotlb.h | 2 -- lib/swiotlb.c | 7 +------ 3 files changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee4420..0ac7cd52478 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,11 +13,6 @@ int swiotlb __read_mostly; -void * __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - void *swiotlb_alloc(unsigned order, unsigned long nslabs) { return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index cb1a6631b8f..94db70444c1 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -14,7 +14,6 @@ struct scatterlist; */ #define IO_TLB_SEGSIZE 128 - /* * log of the size of each IO TLB slab. The number of slabs is command line * controllable. @@ -24,7 +23,6 @@ struct scatterlist; extern void swiotlb_init(void); -extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs); extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bffe6d7ef9d..9edfdd442ed 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -114,11 +114,6 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? 
*/ -void * __weak __init swiotlb_alloc_boot(size_t size, unsigned long nslabs) -{ - return alloc_bootmem_low_pages(size); -} - void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs) { return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); @@ -189,7 +184,7 @@ swiotlb_init_with_default_size(size_t default_size) /* * Get IO TLB memory from the low pages */ - io_tlb_start = swiotlb_alloc_boot(bytes, io_tlb_nslabs); + io_tlb_start = alloc_bootmem_low_pages(bytes); if (!io_tlb_start) panic("Cannot allocate SWIOTLB buffer"); io_tlb_end = io_tlb_start + bytes; -- cgit v1.2.3-70-g09d2 From bb52196be37ce154ddc50b1f39496146d181cbe7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:51 +0900 Subject: swiotlb: remove unused swiotlb_alloc() Nobody uses swiotlb_alloc(). Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/x86/kernel/pci-swiotlb.c | 5 ----- include/linux/swiotlb.h | 2 -- lib/swiotlb.c | 8 ++------ 3 files changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 0ac7cd52478..ea675cfe76f 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,11 +13,6 @@ int swiotlb __read_mostly; -void *swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) { return paddr; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 94db70444c1..6bc50944040 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -23,8 +23,6 @@ struct scatterlist; extern void swiotlb_init(void); -extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); - extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t address); extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 
9edfdd442ed..3c4c21cdf43 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -114,11 +114,6 @@ setup_io_tlb_npages(char *str) __setup("swiotlb=", setup_io_tlb_npages); /* make io_tlb_overflow tunable too? */ -void * __weak swiotlb_alloc(unsigned order, unsigned long nslabs) -{ - return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order); -} - dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) { return paddr; @@ -240,7 +235,8 @@ swiotlb_late_init_with_default_size(size_t default_size) bytes = io_tlb_nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - io_tlb_start = swiotlb_alloc(order, io_tlb_nslabs); + io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); if (io_tlb_start) break; order--; -- cgit v1.2.3-70-g09d2 From cf56e3f2e8a8d5b7bc719980869b0e7985c256f3 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:52 +0900 Subject: swiotlb: remove swiotlb_arch_range_needs_mapping Nobody uses swiotlb_arch_range_needs_mapping(). 
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/x86/kernel/pci-swiotlb.c | 5 ----- include/linux/swiotlb.h | 2 -- lib/swiotlb.c | 15 ++------------- 3 files changed, 2 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index ea675cfe76f..165bd7f93bb 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -23,11 +23,6 @@ phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) return baddr; } -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 6bc50944040..a977da24f17 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -28,8 +28,6 @@ extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t address); -extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); - extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 3c4c21cdf43..dc1cd1f5369 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -141,11 +141,6 @@ int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); } -int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return 0; -} - static void swiotlb_print_info(unsigned long bytes) { phys_addr_t pstart, pend; @@ -312,11 +307,6 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) return swiotlb_arch_address_needs_mapping(hwdev, addr, size); } -static inline int range_needs_mapping(phys_addr_t paddr, size_t size) -{ - return swiotlb_force || 
swiotlb_arch_range_needs_mapping(paddr, size); -} - static int is_swiotlb_buffer(char *addr) { return addr >= io_tlb_start && addr < io_tlb_end; @@ -646,8 +636,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!address_needs_mapping(dev, dev_addr, size) && - !range_needs_mapping(phys, size)) + if (!address_needs_mapping(dev, dev_addr, size) && !swiotlb_force) return dev_addr; /* @@ -810,7 +799,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, phys_addr_t paddr = sg_phys(sg); dma_addr_t dev_addr = swiotlb_phys_to_bus(hwdev, paddr); - if (range_needs_mapping(paddr, sg->length) || + if (swiotlb_force || address_needs_mapping(hwdev, dev_addr, sg->length)) { void *map = map_single(hwdev, sg_phys(sg), sg->length, dir); -- cgit v1.2.3-70-g09d2 From 99becaca86d184a4433e9fde879ff97303d7669f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:54 +0900 Subject: x86: add dma_capable() to replace is_buffer_dma_capable() dma_capable() eventually replaces is_buffer_dma_capable(), which tells if a memory area is dma-capable or not. The problem of is_buffer_dma_capable() is that it doesn't take a pointer to struct device so it doesn't work for POWERPC. 
Signed-off-by: FUJITA Tomonori --- arch/x86/include/asm/dma-mapping.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 1c3f9435f1c..adac59c8f69 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -55,6 +55,14 @@ extern int dma_set_mask(struct device *dev, u64 mask); extern void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag); +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return 0; + + return addr + size <= *dev->dma_mask; +} + static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) -- cgit v1.2.3-70-g09d2 From a4c2baa6e148adfb27beaf16b6fb6d465b5b3acb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:04:55 +0900 Subject: x86: replace is_buffer_dma_capable() with dma_capable Signed-off-by: FUJITA Tomonori --- arch/x86/kernel/pci-dma.c | 2 +- arch/x86/kernel/pci-gart_64.c | 5 ++--- arch/x86/kernel/pci-nommu.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..3c945c0b350 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -147,7 +147,7 @@ again: return NULL; addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size)) { + if (addr + size > dma_mask) { __free_pages(page, get_order(size)); if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d2e56b8f48e..98a827ee9ed 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -190,14 +190,13 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t 
size) { - return force_iommu || - !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return force_iommu || !dma_capable(dev, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - return !is_buffer_dma_capable(*dev->dma_mask, addr, size); + return !dma_capable(dev, addr, size); } /* Map a single continuous physical area into the IOMMU. diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 71d412a09f3..c0a4222bf62 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { + if (hwdev && !dma_capable(hwdev, bus, size)) { if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", -- cgit v1.2.3-70-g09d2 From 8d4f5339d1ee4027c07e6b2a1cfa9dc41b0d383b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:05:01 +0900 Subject: x86, IA64, powerpc: add phys_to_dma() and dma_to_phys() This adds two functions, phys_to_dma() and dma_to_phys() to x86, IA64 and powerpc. swiotlb uses them. phys_to_dma() converts a physical address to a dma address. dma_to_phys() does the opposite. 
Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce --- arch/ia64/include/asm/dma-mapping.h | 10 ++++++++++ arch/powerpc/include/asm/dma-mapping.h | 10 ++++++++++ arch/x86/include/asm/dma-mapping.h | 10 ++++++++++ 3 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 88d0f860394..f91829de329 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -77,6 +77,16 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return addr + size <= *dev->dma_mask; } +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr; +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr; +} + extern int dma_get_cache_alignment(void); static inline void diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 6ff1f8581d7..0c34371ec49 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -437,6 +437,16 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return addr + size <= *dev->dma_mask; } +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr + get_dma_direct_offset(dev); +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr - get_dma_direct_offset(dev); +} + #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) #ifdef CONFIG_NOT_COHERENT_CACHE diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index adac59c8f69..0ee770d23d0 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -63,6 +63,16 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) return 
addr + size <= *dev->dma_mask; } +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return paddr; +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return daddr; +} + static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction dir) -- cgit v1.2.3-70-g09d2 From b683d42693c4e92b838117f5c6f7b90bfa1525c9 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 10 Jul 2009 10:05:04 +0900 Subject: x86: remove unused swiotlb_phys_to_bus() and swiotlb_bus_to_phys() phys_to_dma() and dma_to_phys() are used instead of swiotlb_phys_to_bus() and swiotlb_bus_to_phys(). Signed-off-by: FUJITA Tomonori --- arch/x86/kernel/pci-swiotlb.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 165bd7f93bb..e8a35016115 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -13,16 +13,6 @@ int swiotlb __read_mostly; -dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) -{ - return paddr; -} - -phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) -{ - return baddr; -} - static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags) { -- cgit v1.2.3-70-g09d2 From 94699b04eddd4b247d871930431d6fa1a46c175e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:52:54 +0200 Subject: x86, mce: don't log boot MCEs on Pentium M (model == 13) CPUs On my legacy Pentium M laptop (Acer Extensa 2900) I get bogus MCE on a cold boot with CONFIG_X86_NEW_MCE enabled, i.e. (after decoding it with mcelog): MCE 0 HARDWARE ERROR. This is *NOT* a software problem! 
Please contact your hardware vendor CPU 0 BANK 1 MCG status: MCi status: Error overflow Uncorrected error Error enabled Processor context corrupt MCA: Data CACHE Level-1 UNKNOWN Error STATUS f200000000000195 MCGSTATUS 0 [ The other STATUS values observed: f2000000000001b5 (... UNKNOWN error) and f200000000000115 (... READ Error). To verify that this is not a CONFIG_X86_NEW_MCE bug I also modified the CONFIG_X86_OLD_MCE code (which doesn't log any MCEs) to dump content of STATUS MSR before it is cleared during initialization. ] Since the bogus MCE results in a kernel taint (which in turn disables lockdep support) don't log boot MCEs on Pentium M (model == 13) CPUs by default ("mce=bootlog" boot parameter can be used to get the old behavior). Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 07139a0578e..7bd19c7f531 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1269,6 +1269,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* There are also broken BIOSes on some Pentium M systems. */ + if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; -- cgit v1.2.3-70-g09d2 From e3346fc48204d780f92527d06df8bf6f28d603ec Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:55:09 +0200 Subject: x86, mce: fix "mce" boot option handling for CONFIG_X86_NEW_MCE "mce argument mce ignored. Please use /sys" message shouldn't be printed when using "mce" boot option. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7bd19c7f531..75919440a18 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1549,8 +1549,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) -- cgit v1.2.3-70-g09d2 From 419d6162c0c0103fa2f44f6691dff9cac14c650d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:00 +0200 Subject: x86, mce: add missing __cpuinit tags mce_cap_init() and mce_cpu_quirks() can be tagged with __cpuinit. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 75919440a18..1ce6db1f878 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1158,7 +1158,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. 
*/ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void) { unsigned b; u64 cap; @@ -1222,7 +1222,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { -- cgit v1.2.3-70-g09d2 From d0c87d1f61704ed589fc0788bedd753632340e98 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:37 +0200 Subject: x86, mce: remove never executed code fseverities_coverage is never NULL in err_out code path. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f9705..51f7c725dab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); if (dmce) debugfs_remove(dmce); return -ENOMEM; -- cgit v1.2.3-70-g09d2 From f3a0867b12e0cf1512c0bd0665f2339fc75ed2a8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 29 Jul 2009 00:04:59 +0200 Subject: x86, mce: fix reporting of Thermal Monitoring mechanism enabled Early Pentium M models use different method for enabling TM2 (per paragraph 13.5.2.3 of the "Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A: System Programming Guide, Part 1"). Tested on the affected Pentium M variant (model == 13). Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Andi Kleen Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/mcheck/therm_throt.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d1ce094586..cbec06deb68 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -222,6 +222,10 @@ #define THERM_STATUS_PROCHOT (1 << 0) +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd..15f2bc07bb6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -253,9 +253,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -264,6 +261,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); -- cgit v1.2.3-70-g09d2 From c1dc0b9c0c8979ce4d411caadff5c0d79dee58bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 2 Aug 2009 11:28:21 +0200 Subject: debug lockups: Improve lockup detection When debugging a recent lockup bug i found various deficiencies in how our current lockup detection helpers work: - SysRq-L is not very efficient as it uses a 
workqueue, hence it cannot punch through hard lockups and cannot see through most soft lockups either. - The SysRq-L code depends on the NMI watchdog - which is off by default. - We don't print backtraces from the RCU code's built-in 'RCU state machine is stuck' debug code. This debug code tends to be one of the first (and only) mechanisms that show that a lockup has occurred. This patch changes the code so that we: - Trigger the NMI backtrace code from SysRq-L instead of using a workqueue (which cannot punch through hard lockups) - Trigger print-all-CPU-backtraces from the RCU lockup detection code Also decouple the backtrace printing code from the NMI watchdog: - Don't use variable size cpumasks (it might not be initialized and they are a bit more fragile anyway) - Trigger an NMI immediately via an IPI, instead of waiting for the NMI tick to occur. This is a lot faster and can produce more relevant backtraces. It will also work if the NMI watchdog is disabled. - Don't print the 'dazed and confused' message when we print a backtrace from the NMI - Do a show_regs() plus a dump_stack() to get maximum info out of the dump. Worst-case we get two stacktraces - which is not a big deal. Sometimes, if register content is corrupted, the precise stack walker in show_regs() won't give us a full backtrace - in this case dump_stack() will do it. Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/nmi.c | 18 ++++++++++++------ drivers/char/sysrq.c | 8 ++------ kernel/rcutree.c | 7 ++++++- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63..1bb1ac20e9e 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -39,7 +39,7 @@ int unknown_nmi_panic; int nmi_watchdog_enabled; -static cpumask_var_t backtrace_mask; +static cpumask_t backtrace_mask __read_mostly; /* nmi_active: * >0: the lapic NMI watchdog is active, but can be disabled @@ -138,7 +138,6 @@ int __init check_nmi_watchdog(void) if (!prev_nmi_count) goto error; - alloc_cpumask_var(&backtrace_mask, GFP_KERNEL|__GFP_ZERO); printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP @@ -415,14 +414,17 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) } /* We can be called before check_nmi_watchdog, hence NULL check. 
*/ - if (backtrace_mask != NULL && cpumask_test_cpu(cpu, backtrace_mask)) { + if (cpumask_test_cpu(cpu, &backtrace_mask)) { static DEFINE_SPINLOCK(lock); /* Serialise the printks */ spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); dump_stack(); spin_unlock(&lock); - cpumask_clear_cpu(cpu, backtrace_mask); + cpumask_clear_cpu(cpu, &backtrace_mask); + + rc = 1; } /* Could check oops_in_progress here too, but it's safer not to */ @@ -556,10 +558,14 @@ void __trigger_all_cpu_backtrace(void) { int i; - cpumask_copy(backtrace_mask, cpu_online_mask); + cpumask_copy(&backtrace_mask, cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(backtrace_mask)) + if (cpumask_empty(&backtrace_mask)) break; mdelay(1); } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 5d7a02f63e1..165f307f30e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -222,12 +223,7 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) { - struct pt_regs *regs = get_irq_regs(); - if (regs) { - printk(KERN_INFO "CPU%d:\n", smp_processor_id()); - show_regs(regs); - } - schedule_work(&sysrq_showallcpus); + trigger_all_cpu_backtrace(); } static struct sysrq_key_op sysrq_showallcpus_op = { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7717b95c202..9c5fa9fc57e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -469,6 +470,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp) } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); + trigger_all_cpu_backtrace(); + 
force_quiescent_state(rsp, 0); /* Kick them all. */ } @@ -479,12 +482,14 @@ static void print_cpu_stall(struct rcu_state *rsp) printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", smp_processor_id(), jiffies - rsp->gp_start); - dump_stack(); + trigger_all_cpu_backtrace(); + spin_lock_irqsave(&rnp->lock, flags); if ((long)(jiffies - rsp->jiffies_stall) >= 0) rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; spin_unlock_irqrestore(&rnp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ } -- cgit v1.2.3-70-g09d2 From 25f6e89bedd29cc49bfa0d55497e91a671b9ae6e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Thu, 30 Jul 2009 23:21:18 +0200 Subject: x86: Remove superfluous NULL pointer check in destroy_irq() This takes care of the following entry from Dan's list: arch/x86/kernel/apic/io_apic.c +3241 destroy_irq(11) warning: variable derefenced before check 'desc' Reported-by: Dan Carpenter Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Jonathan Corbet Cc: Eugene Teo Cc: Julia Lawall LKML-Reference: <200907302321.19086.bzolnier@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cf51b0b58c5..7e92a9212fd 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3185,8 +3185,7 @@ void destroy_irq(unsigned int irq) cfg = desc->chip_data; dynamic_irq_cleanup(irq); /* connect back irq_cfg */ - if (desc) - desc->chip_data = cfg; + desc->chip_data = cfg; free_irte(irq); spin_lock_irqsave(&vector_lock, flags); -- cgit v1.2.3-70-g09d2 From 47cab6a722d44c71c4f8224017ef548522243cf4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Aug 2009 09:31:54 +0200 Subject: debug lockups: Improve lockup detection, fix generic arch fallback As Andrew noted, my previous patch ("debug lockups: Improve lockup 
detection") broke/removed SysRq-L support from architectures that do not provide a __trigger_all_cpu_backtrace implementation. Restore a fallback path and clean up the SysRq-L machinery a bit: - Rename the arch method to arch_trigger_all_cpu_backtrace() - Simplify the define - Document the method a bit - in the hope of more architectures adding support for it. [ The patch touches Sparc code for the rename. ] Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: "David S. Miller" LKML-Reference: <20090802140809.7ec4bb6b.akpm@linux-foundation.org> Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/irq_64.h | 4 ++-- arch/sparc/kernel/process_64.c | 4 ++-- arch/x86/include/asm/nmi.h | 4 ++-- arch/x86/kernel/apic/nmi.c | 2 +- drivers/char/sysrq.c | 15 ++++++++++++++- include/linux/nmi.h | 19 +++++++++++++++++-- 6 files changed, 38 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index 1934f2cbf51..a0b443cb3c1 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -89,8 +89,8 @@ static inline unsigned long get_softint(void) return retval; } -void __trigger_all_cpu_backtrace(void); -#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() +void arch_trigger_all_cpu_backtrace(void); +#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace extern void *hardirq_stack[NR_CPUS]; extern void *softirq_stack[NR_CPUS]; diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 4041f94e772..18d67854a1b 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -251,7 +251,7 @@ static void __global_reg_poll(struct global_reg_snapshot *gp) } } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { struct thread_info *tp = current_thread_info(); struct pt_regs *regs = get_irq_regs(); @@ -304,7 +304,7 @@ void __trigger_all_cpu_backtrace(void) 
static void sysrq_handle_globreg(int key, struct tty_struct *tty) { - __trigger_all_cpu_backtrace(); + arch_trigger_all_cpu_backtrace(); } static struct sysrq_key_op sparc_globalreg_op = { diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index c86e5ed4af5..e63cf7d441e 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -45,8 +45,8 @@ extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; -void __trigger_all_cpu_backtrace(void); -#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() +void arch_trigger_all_cpu_backtrace(void); +#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace static inline void localise_nmi_watchdog(void) { diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 1bb1ac20e9e..db7220220d0 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -554,7 +554,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu) return 0; } -void __trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(void) { int i; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 165f307f30e..50eecfe1d72 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -223,7 +223,20 @@ static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) { - trigger_all_cpu_backtrace(); + /* + * Fall back to the workqueue based printing if the + * backtrace printing did not succeed or the + * architecture has no support for it: + */ + if (!trigger_all_cpu_backtrace()) { + struct pt_regs *regs = get_irq_regs(); + + if (regs) { + printk(KERN_INFO "CPU%d:\n", smp_processor_id()); + show_regs(regs); + } + schedule_work(&sysrq_showallcpus); + } } static struct sysrq_key_op sysrq_showallcpus_op = { diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 29af2d5df09..b752e807add 100644 --- 
a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -28,8 +28,23 @@ static inline void acpi_nmi_disable(void) { } static inline void acpi_nmi_enable(void) { } #endif -#ifndef trigger_all_cpu_backtrace -#define trigger_all_cpu_backtrace() do { } while (0) +/* + * Create trigger_all_cpu_backtrace() out of the arch-provided + * base function. Return whether such support was available, + * to allow calling code to fall back to some other mechanism: + */ +#ifdef arch_trigger_all_cpu_backtrace +static inline bool trigger_all_cpu_backtrace(void) +{ + arch_trigger_all_cpu_backtrace(); + + return true; +} +#else +static inline bool trigger_all_cpu_backtrace(void) +{ + return false; +} #endif #endif -- cgit v1.2.3-70-g09d2 From 6a12235c7d2d75c7d94b9afcaaecd422ff845ce0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 10:25:58 +0100 Subject: agp: kill phys_to_gart() and gart_to_phys() There seems to be no reason for these -- they're a 1:1 mapping on all platforms. Signed-off-by: David Woodhouse --- arch/alpha/include/asm/agp.h | 4 ---- arch/ia64/include/asm/agp.h | 4 ---- arch/parisc/include/asm/agp.h | 4 ---- arch/powerpc/include/asm/agp.h | 4 ---- arch/sparc/include/asm/agp.h | 4 ---- arch/x86/include/asm/agp.h | 4 ---- drivers/char/agp/agp.h | 3 --- drivers/char/agp/ali-agp.c | 4 ++-- drivers/char/agp/amd-k7-agp.c | 8 ++++---- drivers/char/agp/amd64-agp.c | 6 +++--- drivers/char/agp/ati-agp.c | 6 +++--- drivers/char/agp/backend.c | 2 +- drivers/char/agp/efficeon-agp.c | 4 ++-- drivers/char/agp/generic.c | 6 +++--- drivers/char/agp/hp-agp.c | 4 ++-- drivers/char/agp/i460-agp.c | 4 ++-- drivers/char/agp/intel-agp.c | 7 +++---- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/sgi-agp.c | 2 +- drivers/char/agp/sworks-agp.c | 8 ++++---- drivers/char/agp/uninorth-agp.c | 2 +- 21 files changed, 32 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h index 26c17913529..a94d48b8677 
100644 --- a/arch/alpha/include/asm/agp.h +++ b/arch/alpha/include/asm/agp.h @@ -9,10 +9,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/ia64/include/asm/agp.h b/arch/ia64/include/asm/agp.h index c11fdd8ab4d..01d09c401c5 100644 --- a/arch/ia64/include/asm/agp.h +++ b/arch/ia64/include/asm/agp.h @@ -17,10 +17,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/parisc/include/asm/agp.h b/arch/parisc/include/asm/agp.h index 9651660da63..d226ffa8fc1 100644 --- a/arch/parisc/include/asm/agp.h +++ b/arch/parisc/include/asm/agp.h @@ -11,10 +11,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/powerpc/include/asm/agp.h b/arch/powerpc/include/asm/agp.h index 86455c4c31e..416e12c2d50 100644 --- a/arch/powerpc/include/asm/agp.h +++ b/arch/powerpc/include/asm/agp.h @@ -8,10 +8,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. 
Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/sparc/include/asm/agp.h b/arch/sparc/include/asm/agp.h index c2456870b05..70f52c1661b 100644 --- a/arch/sparc/include/asm/agp.h +++ b/arch/sparc/include/asm/agp.h @@ -7,10 +7,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b..eec2a70d437 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -22,10 +22,6 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 4c6e5079d87..d6f36c004d9 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -318,9 +318,6 @@ void agp3_generic_cleanup(void); #define AGP_GENERIC_SIZES_ENTRIES 11 extern const struct aper_size_info_16 agp3_generic_sizes[]; -#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x))) -#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x))) - extern int agp_off; extern int agp_try_unsupported_boot; diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 201ef3ffd48..d2ce68f27e4 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -152,7 +152,7 @@ static struct page *m1541_alloc_page(struct agp_bridge_data *bridge) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN )); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN )); return page; } @@ -180,7 +180,7 @@ static void m1541_destroy_page(struct page *page, int flags) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN)); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN)); } agp_generic_destroy_page(page, flags); } diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index 542a87895ae..73dbf40c874 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -44,7 +44,7 @@ static int amd_create_page_map(struct amd_page_map *page_map) #ifndef CONFIG_X86 SetPageReserved(virt_to_page(page_map->real)); global_cache_flush(); - page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), + page_map->remapped = 
ioremap_nocache(virt_to_phys(page_map->real), PAGE_SIZE); if (page_map->remapped == NULL) { ClearPageReserved(virt_to_page(page_map->real)); @@ -160,7 +160,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -173,7 +173,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } @@ -325,7 +325,7 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type) addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_generic_mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. 
*/ diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index e85a5b3e952..2fb2e6cc322 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -79,7 +79,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); @@ -178,7 +178,7 @@ static const struct aper_size_info_32 amd_8151_sizes[7] = static int amd_8151_configure(void) { - unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); + unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; /* Configure AGP regs in each x86-64 host bridge. */ @@ -558,7 +558,7 @@ static void __devexit agp_amd64_remove(struct pci_dev *pdev) { struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - release_mem_region(virt_to_gart(bridge->gatt_table_real), + release_mem_region(virt_to_phys(bridge->gatt_table_real), amd64_aperture_sizes[bridge->aperture_size_idx].size); agp_remove_bridge(bridge); agp_put_bridge(bridge); diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 59ebd60c1b6..3b2ecbe86eb 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -302,7 +302,7 @@ static int ati_insert_memory(struct agp_memory * mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } @@ -360,7 +360,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = 
virt_to_phys(page_dir.real); /* Write out the size register */ current_size = A_SIZE_LVL2(agp_bridge->current_size); @@ -390,7 +390,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 343f102090a..ad87753f6de 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -159,7 +159,7 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) goto err_out_nounmap; } } else { - bridge->scratch_page_dma = phys_to_gart(page_to_phys(page)); + bridge->scratch_page_dma = page_to_phys(page); } bridge->scratch_page = bridge->driver->mask_memory(bridge, diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 35d50f2861b..793f39ea961 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -67,7 +67,7 @@ static const struct gatt_mask efficeon_generic_masks[] = /* This function does the same thing as mask_memory() for this chipset... 
*/ static inline unsigned long efficeon_mask_memory(struct page *page) { - unsigned long addr = phys_to_gart(page_to_phys(page)); + unsigned long addr = page_to_phys(page); return addr | 0x00000001; } @@ -226,7 +226,7 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) efficeon_private.l1_table[index] = page; - value = virt_to_gart((unsigned long *)page) | pati | present | index; + value = virt_to_phys((unsigned long *)page) | pati | present | index; pci_write_config_dword(agp_bridge->dev, EFFICEON_ATTPAGE, value); diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 28f0208c66a..c50543966eb 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -988,7 +988,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) set_memory_uc((unsigned long)table, 1 << page_order); bridge->gatt_table = (void *)table; #else - bridge->gatt_table = ioremap_nocache(virt_to_gart(table), + bridge->gatt_table = ioremap_nocache(virt_to_phys(table), (PAGE_SIZE * (1 << page_order))); bridge->driver->cache_flush(); #endif @@ -1001,7 +1001,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) return -ENOMEM; } - bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real); + bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real); /* AK: bogus, should encode addresses > 4GB */ for (i = 0; i < num_entries; i++) { @@ -1142,7 +1142,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type), bridge->gatt_table+j); } diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 64dbf4b1cf2..501e293e5ad 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -107,7 +107,7 @@ static int __init hp_zx1_ioc_shared(void) hp->gart_size = HP_ZX1_GART_SIZE; 
hp->gatt_entries = hp->gart_size / hp->io_page_size; - hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); + hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)]; if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) { @@ -246,7 +246,7 @@ hp_zx1_configure (void) agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS); if (hp->io_pdir_owner) { - writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); + writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); readl(hp->ioc_regs+HP_ZX1_PDIR_BASE); writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG); readl(hp->ioc_regs+HP_ZX1_TCNFG); diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 54191f86053..e763d3312ce 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -325,7 +325,7 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, io_page_size = 1UL << I460_IO_PAGE_SHIFT; for (i = 0, j = io_pg_start; i < mem->page_count; i++) { - paddr = phys_to_gart(page_to_phys(mem->pages[i])); + paddr = page_to_phys(mem->pages[i]); for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size) WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type)); } @@ -382,7 +382,7 @@ static int i460_alloc_large_page (struct lp_desc *lp) return -ENOMEM; } - lp->paddr = phys_to_gart(page_to_phys(lp->page)); + lp->paddr = page_to_phys(lp->page); lp->refcount = 0; atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); return 0; diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index d8c80d8be5e..aa8889e8afc 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -288,7 +288,7 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), 
+ page_to_phys(mem->pages[i]), mask_type), intel_private.gtt+j); } @@ -470,8 +470,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, global_cache_flush(); for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), - mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); @@ -977,7 +976,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index cedacee30ec..7e36d2b4f9d 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), agp_bridge->gatt_table+nvidia_private.pg_offset+j); } diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index 0d47fa84740..0d426ae39c8 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -190,7 +190,7 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { table[j] = bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type); } diff --git a/drivers/char/agp/sworks-agp.c 
b/drivers/char/agp/sworks-agp.c index 07259952fc3..13acaaf64ed 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -155,7 +155,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Create a fake scratch directory */ for (i = 0; i < 1024; i++) { writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); - writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); } retval = serverworks_create_gatt_pages(value->num_entries / 1024); @@ -167,7 +167,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -179,7 +179,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++) - writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); return 0; } @@ -350,7 +350,7 @@ static int serverworks_insert_memory(struct agp_memory *mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = SVRWRKS_GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mem->type), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } serverworks_tlbflush(mem); diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index f192c3b9ad4..2e993112ab8 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -431,7 +431,7 @@ static int 
uninorth_create_gatt_table(struct agp_bridge_data *bridge) bridge->gatt_table_real = (u32 *) table; bridge->gatt_table = (u32 *)table; - bridge->gatt_bus_addr = virt_to_gart(table); + bridge->gatt_bus_addr = virt_to_phys(table); for (i = 0; i < num_entries; i++) bridge->gatt_table[i] = 0; -- cgit v1.2.3-70-g09d2 From ed8d9adf357ec331603fa1049510399812cea7e5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Aug 2009 14:08:48 +0900 Subject: x86, percpu: Add 'percpu_read_stable()' interface for cacheable accesses This is very useful for some common things like 'get_current()' and 'get_thread_info()', which can be used multiple times in a function, and where the result is cacheable. tj: Added the magical undocumented "P" modifier to UP __percpu_arg() to force gcc to dereference the pointer value passed in via the "p" input constraint. Without this, percpu_read_stable() returns the address of the percpu variable. Also added comment explaining the difference between percpu_read() and percpu_read_stable(). Signed-off-by: Linus Torvalds Signed-off-by: Tejun Heo Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/current.h | 2 +- arch/x86/include/asm/percpu.h | 26 +++++++++++++++++++------- arch/x86/include/asm/thread_info.h | 2 +- 3 files changed, 21 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h index c68c361697e..4d447b732d8 100644 --- a/arch/x86/include/asm/current.h +++ b/arch/x86/include/asm/current.h @@ -11,7 +11,7 @@ DECLARE_PER_CPU(struct task_struct *, current_task); static __always_inline struct task_struct *get_current(void) { - return percpu_read(current_task); + return percpu_read_stable(current_task); } #define current get_current() diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d8..04eacefcfd2 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -49,7 +49,7 @@ #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x #define __my_cpu_offset percpu_read(this_cpu_off) #else -#define __percpu_arg(x) "%" #x +#define __percpu_arg(x) "%P" #x #endif /* @@ -104,36 +104,48 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var) \ +#define percpu_from_op(op, var, constraint) \ ({ \ typeof(var) ret__; \ switch (sizeof(var)) { \ case 1: \ asm(op "b "__percpu_arg(1)",%0" \ : "=q" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 2: \ asm(op "w "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 4: \ asm(op "l "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ case 8: \ asm(op "q "__percpu_arg(1)",%0" \ : "=r" (ret__) \ - : "m" (var)); \ + : constraint); \ break; \ default: __bad_percpu_size(); \ } \ ret__; \ }) -#define percpu_read(var) percpu_from_op("mov", per_cpu__##var) +/* + * percpu_read() makes gcc load the percpu variable every time it is + * accessed while percpu_read_stable() allows the value to be cached. 
+ * percpu_read_stable() is more efficient and can be used if its value + * is guaranteed to be valid across cpus. The current users include + * get_current() and get_thread_info() both of which are actually + * per-thread variables implemented as per-cpu variables and thus + * stable for the duration of the respective task. + */ +#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ + "m" (per_cpu__##var)) +#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ + "p" (&per_cpu__##var)) #define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) #define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) #define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fad7d40b75f..a1bb5a114bf 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -213,7 +213,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - ti = (void *)(percpu_read(kernel_stack) + + ti = (void *)(percpu_read_stable(kernel_stack) + KERNEL_STACK_OFFSET - THREAD_SIZE); return ti; } -- cgit v1.2.3-70-g09d2 From 3e352aa8ee2bd48f1a19c7742810b3a4a7ba605e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Aug 2009 14:10:11 +0900 Subject: x86, percpu: Fix DECLARE/DEFINE_PER_CPU_PAGE_ALIGNED() DECLARE/DEFINE_PER_CPU_PAGE_ALIGNED() put percpu variables in .page_aligned section without adding any alignment restrictions. Currently, this doesn't cause any problem because all users of the macros have explicit page alignment and page-sized but it's much safer to enforce page alignment from the macros. After all, it's what they claim to do. Add __aligned(PAGE_SIZE) to DECLARE/DEFINE_PER_CPU_PAGE_ALIGNED() and drop explicit alignment from it users. Signed-off-by: Tejun Heo Cc: Ingo Molnar Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/common.c | 3 +-- include/linux/percpu-defs.h | 8 +++++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9..12493c5485e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1008,8 +1008,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { }; static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]) - __aligned(PAGE_SIZE); + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ void syscall_init(void) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 68438e18fff..afd5f8b7061 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -69,11 +69,13 @@ /* * Declaration/definition used for per-CPU variables that must be page aligned. */ -#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ - DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") +#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") \ + __aligned(PAGE_SIZE) #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ - DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") + DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") \ + __aligned(PAGE_SIZE) /* * Intermodule exports for per-CPU variables. -- cgit v1.2.3-70-g09d2 From bdf977b37418cdf8a2252504779a7e12a09b7575 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Aug 2009 14:12:19 +0900 Subject: x86, percpu: Collect hot percpu variables into one cacheline On x86_64, percpu variables current_task and kernel_stack are used for get_current() and current_thread_info() respectively and thus are often used close to each other. 
Move definition of current_task to kernel/cpu/common.c right above kernel_stack definition and align it to cacheline so that they always fall into the same cacheline. Two percpu variables defined there together - irq_stack_ptr and irq_count - are also pretty hot and will benefit from sharing the cacheline. For consistency, current_task definition for x86_32 is also moved to kernel/cpu/common.c. Putting current_task and kernel_stack into the same cacheline was suggested by Linus Torvalds. Signed-off-by: Tejun Heo Cc: Linus Torvalds Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 15 +++++++++++++-- arch/x86/kernel/process_32.c | 3 --- arch/x86/kernel/process_64.c | 3 --- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 12493c5485e..1bd88ed978b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -987,13 +987,21 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. 
+ */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(unsigned long, kernel_stack) = (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + DEFINE_PER_CPU(unsigned int, irq_count) = -1; /* @@ -1041,6 +1049,9 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist); #else /* CONFIG_X86_64 */ +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU(unsigned long, stack_canary); #endif diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 59f4524984a..daa4107be3b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -61,9 +61,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - /* * Return saved PC of a blocked thread. 
*/ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ebefb5407b9..c4c675d5ba1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -55,9 +55,6 @@ asmlinkage extern void ret_from_fork(void); -DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; -EXPORT_PER_CPU_SYMBOL(current_task); - DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); -- cgit v1.2.3-70-g09d2 From c7bd0414d681706a32105895cae20fb9090db52e Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Thu, 23 Jul 2009 20:56:27 +0200 Subject: x86: Simplify the Makefile in a minor way through use of cc-ifversion Signed-off-by: Frans Pop Acked-by: Sam Ravnborg Reviewed-by: WANG Cong LKML-Reference: <200907232056.28635.elendil@planet.nl> Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b..1f3851a626d 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -32,8 +32,8 @@ ifeq ($(CONFIG_X86_32),y) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: - KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ - echo $(call cc-option,-fno-unit-at-a-time); fi ;) + KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \ + $(call cc-option,-fno-unit-at-a-time)) # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/x86/Makefile_32.cpu -- cgit v1.2.3-70-g09d2 From 54a0bf3c2cad3fd118ea725f26a493aece6ea01d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 4 Aug 2009 15:52:38 +0200 Subject: Revert "x86: oprofile/op_model_amd.c set return values for op_amd_handle_ibs()" This reverts commit 21e70878215f620fe99ea7d7c74bc641aeec932f. Instead Andrew's patch will be applied he posted at the same time. 
Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 827beecb67a..37d19c768d5 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -195,7 +195,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, struct op_entry entry; if (!has_ibs) - return 0; + return 1; if (ibs_config.fetch_enabled) { rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); @@ -277,10 +277,7 @@ static void op_amd_stop_ibs(void) #else static inline int op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - return 0; -} + struct op_msrs const * const msrs) { } static inline void op_amd_start_ibs(void) { } static inline void op_amd_stop_ibs(void) { } -- cgit v1.2.3-70-g09d2 From 4680e64a88c4ce2c4e736dade99233e3def13fa7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 23 Jun 2009 12:36:08 -0700 Subject: arch/x86/oprofile/op_model_amd.c: fix op_amd_handle_ibs() return type arch/x86/oprofile/op_model_amd.c: In function 'op_amd_handle_ibs': arch/x86/oprofile/op_model_amd.c:217: warning: no return statement in function returning non-void Fix this by making op_amd_handle_ibs() return void. 
Cc: Robert Richter Signed-off-by: Andrew Morton Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 37d19c768d5..39686c29f03 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -187,7 +187,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, #ifdef CONFIG_OPROFILE_IBS -static inline int +static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { @@ -195,7 +195,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, struct op_entry entry; if (!has_ibs) - return 1; + return; if (ibs_config.fetch_enabled) { rdmsrl(MSR_AMD64_IBSFETCHCTL, ctl); @@ -241,8 +241,6 @@ op_amd_handle_ibs(struct pt_regs * const regs, wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } - - return 1; } static inline void op_amd_start_ibs(void) @@ -276,7 +274,7 @@ static void op_amd_stop_ibs(void) #else -static inline int op_amd_handle_ibs(struct pt_regs * const regs, +static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { } static inline void op_amd_start_ibs(void) { } static inline void op_amd_stop_ibs(void) { } -- cgit v1.2.3-70-g09d2 From 2977fb3ffc8493a2f4f0a362e8660a6cde9f1bb9 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 1 Aug 2009 11:47:59 +0400 Subject: x86, ioapic: Introduce for_each_irq_pin() helper This allow us to save a few lines of code. 
Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org LKML-Reference: <20090801075435.597863129@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 43 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7e92a9212fd..ffd8fdfcbe4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -66,6 +66,8 @@ #include #define __apicdebuginit(type) static type __init +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) /* * Is the SiS APIC rmw bug present ? @@ -410,7 +412,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) unsigned long flags; spin_lock_irqsave(&ioapic_lock, flags); - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; @@ -490,22 +492,21 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) { - struct irq_pin_list **entryp, *entry; + struct irq_pin_list **last, *entry; - for (entryp = &cfg->irq_2_pin; - *entryp != NULL; - entryp = &(*entryp)->next) { - entry = *entryp; - /* not again, please */ + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) return; + last = &entry->next; } entry = get_one_free_irq_2_pin(node); entry->apic = apic; entry->pin = pin; - *entryp = entry; + *last = entry; } /* @@ -517,7 +518,7 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, { struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; @@ -537,7 +538,7 @@ static void 
io_apic_modify_irq(struct irq_cfg *cfg, int pin; struct irq_pin_list *entry; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin * 2); @@ -1669,12 +1670,8 @@ __apicdebuginit(void) print_IO_APIC(void) if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); - for (;;) { + for_each_irq_pin(entry, cfg->irq_2_pin) printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } printk("\n"); } @@ -2227,7 +2224,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq struct irq_pin_list *entry; u8 vector = cfg->vector; - for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; apic = entry->apic; @@ -2556,20 +2553,10 @@ static void ack_apic_level(unsigned int irq) #ifdef CONFIG_INTR_REMAP static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) { - int apic, pin; struct irq_pin_list *entry; - entry = cfg->irq_2_pin; - for (;;) { - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; - io_apic_eoi(apic, pin); - entry = entry->next; - } + for_each_irq_pin(entry, cfg->irq_2_pin) + io_apic_eoi(entry->apic, entry->pin); } static void -- cgit v1.2.3-70-g09d2 From a7428cd2ef77734465e36bceb43290e37e2a97c6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 1 Aug 2009 11:48:00 +0400 Subject: x86, ioapic: Throw BUG instead of NULL dereference Instead of plain NULL deref we better throw error message with a backtrace. Actually we need more gracious error handling here. Meanwhile leave it as is. 
Signed-off-by: Cyrill Gorcunov Cc: yinghai@kernel.org LKML-Reference: <20090801075435.769301745@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ffd8fdfcbe4..2a145d3a837 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -503,6 +503,10 @@ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin } entry = get_one_free_irq_2_pin(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list\n"); + BUG_ON(1); + } entry->apic = apic; entry->pin = pin; -- cgit v1.2.3-70-g09d2 From 9910887af84e33ba98fd6792029470ae80166208 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 23 Jul 2009 00:52:59 +0400 Subject: x86, apic: Drop redundant bit assignment cpu_has_apic has already investigated boot_cpu_data X86_FEATURE_APIC bit for being clear if condition is triggered. So there is no need to clear this bit second time. Signed-off-by: Cyrill Gorcunov Cc: "Maciej W.
Rozycki" LKML-Reference: <20090722205259.GE15805@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec6..0b021c56e82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1651,7 +1651,6 @@ int __init APIC_init_uniprocessor(void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } #endif -- cgit v1.2.3-70-g09d2 From ce69a784504222c3ab6f1b3c357d09ec5772127a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 20 Jul 2009 15:24:17 +0300 Subject: x86/apic: Enable x2APIC without interrupt remapping under KVM KVM would like to provide x2APIC interface to a guest without emulating interrupt remapping device. The reason KVM prefers guest to use x2APIC is that x2APIC interface is better virtualizable and provides better performance than mmio xAPIC interface: - msr exits are faster than mmio (no page table walk, emulation) - no need to read back ICR to look at the busy bit - one 64 bit ICR write instead of two 32 bit writes - shared code with the Hyper-V paravirt interface Included patch changes x2APIC enabling logic to enable it even if IR initialization failed, but kernel runs under KVM and no apic id is greater than 255 (if there is one spec requires BIOS to move to x2apic mode before starting an OS). 
-v2: fix build -v3: fix bug causing compiler warning Signed-off-by: Gleb Natapov Acked-by: Suresh Siddha Cc: Sheng Yang Cc: "avi@redhat.com" LKML-Reference: <20090720122417.GR5638@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 7 ++++ arch/x86/kernel/apic/apic.c | 83 +++++++++++++++++++++++------------------ arch/x86/kernel/apic/probe_64.c | 6 +-- 3 files changed, 56 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d4792584..586b7adb8e5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -183,6 +183,10 @@ static inline int x2apic_enabled(void) } #define x2apic_supported() (cpu_has_x2apic) +static inline void x2apic_force_phys(void) +{ + x2apic_phys = 1; +} #else static inline void check_x2apic(void) { @@ -194,6 +198,9 @@ static inline int x2apic_enabled(void) { return 0; } +static inline void x2apic_force_phys(void) +{ +} #define x2apic_preenabled 0 #define x2apic_supported() 0 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0b021c56e82..de039fcdd05 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -49,6 +49,7 @@ #include #include #include +#include unsigned int num_processors; @@ -1361,52 +1362,76 @@ void enable_x2apic(void) } #endif /* CONFIG_X86_X2APIC */ -void __init enable_IR_x2apic(void) +int __init enable_IR(void) { #ifdef CONFIG_INTR_REMAP int ret; - unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; ret = dmar_table_init(); if (ret) { pr_debug("dmar_table_init() failed with %d:\n", ret); - goto ir_failed; + return 0; } if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - goto ir_failed; + return 0; } - if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return; + return 0; } + if (enable_intr_remapping(x2apic_supported())) + return 0; + + 
pr_info("Enabled Interrupt-remapping\n"); + + return 1; + +#endif + return 0; +} + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + struct IO_APIC_route_entry **ioapic_entries = NULL; + int ret, x2apic_enabled = 0; + ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { - pr_info("Allocate ioapic_entries failed: %d\n", ret); - goto end; + pr_err("Allocate ioapic_entries failed\n"); + goto out; } ret = save_IO_APIC_setup(ioapic_entries); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto end; + goto out; } local_irq_save(flags); - mask_IO_APIC_setup(ioapic_entries); mask_8259A(); + mask_IO_APIC_setup(ioapic_entries); - ret = enable_intr_remapping(x2apic_supported()); - if (ret) - goto end_restore; + ret = enable_IR(); + if (!ret) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || !kvm_para_available()) + goto nox2apic; + /* + * without IR all CPUs can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_force_phys(); + } - pr_info("Enabled Interrupt-remapping\n"); + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { x2apic_mode = 1; @@ -1414,41 +1439,25 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -end_restore: - if (ret) - /* - * IR enabling failed - */ +nox2apic: + if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); local_irq_restore(flags); -end: +out: if (ioapic_entries) free_ioapic_entries(ioapic_entries); - if (!ret) + if (x2apic_enabled) return; -ir_failed: if (x2apic_preenabled) - panic("x2apic enabled by bios. 
But IR enabling failed"); + panic("x2apic: enabled by BIOS but kernel init failed."); else if (cpu_has_x2apic) - pr_info("Not enabling x2apic,Intr-remapping\n"); -#else - if (!cpu_has_x2apic) - return; - - if (x2apic_preenabled) - panic("x2apic enabled prior OS handover," - " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP"); -#endif - - return; + pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); } - #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b8..f3b1037076e 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -50,11 +50,11 @@ static struct apic *apic_probe[] __initdata = { void __init default_setup_apic_routing(void) { #ifdef CONFIG_X86_X2APIC - if (x2apic_mode && (apic != &apic_x2apic_phys && + if (x2apic_mode #ifdef CONFIG_X86_UV - apic != &apic_x2apic_uv_x && + && apic != &apic_x2apic_uv_x #endif - apic != &apic_x2apic_cluster)) { + ) { if (x2apic_phys) apic = &apic_x2apic_phys; else -- cgit v1.2.3-70-g09d2 From 07868b086cca784f4b532fc2ab574ec3a73b468a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 29 Jul 2009 03:33:51 +0200 Subject: tracing/function-graph-tracer: Drop the useless nmi protection The function graph tracer used to have a protection against NMI while entering a function entry tracing. But this is useless now, this tracer is reentrant and the ring buffer supports the NMI tracing. We can then drop this protection. 
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt --- arch/x86/kernel/ftrace.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d94e1ea3b9f..8e9663413b7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, unsigned long return_hooker = (unsigned long) &return_to_handler; - /* Nmi's are currently unsupported */ - if (unlikely(in_nmi())) - return; - if (unlikely(atomic_read(&current->tracing_graph_pause))) return; -- cgit v1.2.3-70-g09d2 From f3d1915a8623b9248572d3ee44e19a80b7a3520b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 6 Aug 2009 00:09:31 +0400 Subject: x86, ioapic: Panic on irq-pin binding only if needed Though the most time we are to panic on irq-pin allocation fails, for PCI interrupts it's not the case and we could continue operate even if irq-pin allocation failed. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090805200931.GB5319@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2a145d3a837..2999f3dd588 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -490,7 +490,8 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs.
*/ -static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +static int +add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) { struct irq_pin_list **last, *entry; @@ -498,19 +499,27 @@ static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin last = &cfg->irq_2_pin; for_each_irq_pin(entry, cfg->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return; + return 0; last = &entry->next; } entry = get_one_free_irq_2_pin(node); if (!entry) { - printk(KERN_ERR "can not alloc irq_pin_list\n"); - BUG_ON(1); + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; } entry->apic = apic; entry->pin = pin; *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); } /* @@ -3843,7 +3852,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, */ if (irq >= NR_IRQS_LEGACY) { cfg = desc->chip_data; - add_pin_to_irq_node(cfg, node, ioapic, pin); + if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { + printk(KERN_INFO "can not add pin %d for irq %d\n", + pin, irq); + return 0; + } } setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); -- cgit v1.2.3-70-g09d2 From 1e5de18278e6862f4198412b5059a03770fa816a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 19 Jul 2009 00:12:20 +0900 Subject: x86: Introduce GDT_ENTRY_INIT() GDT_ENTRY_INIT is static initializer of desc_struct. We already have similar macro GDT_ENTRY() but it's static initializer for u64 and it cannot be used for desc_struct. 
Signed-off-by: Akinobu Mita LKML-Reference: <20090718151219.GD11294@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc_defs.h | 6 ++++++ arch/x86/include/asm/lguest.h | 5 +++-- arch/x86/include/asm/stackprotector.h | 2 +- arch/x86/kernel/apm_32.c | 2 +- arch/x86/kernel/cpu/common.c | 40 +++++++++++++++++------------------ drivers/pnp/pnpbios/bioscalls.c | 5 +---- 6 files changed, 32 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index a6adefa28b9..9d6684849fd 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -34,6 +34,12 @@ struct desc_struct { }; } __attribute__((packed)); +#define GDT_ENTRY_INIT(flags, base, limit) { { { \ + .a = ((limit) & 0xffff) | (((base) & 0xffff) << 16), \ + .b = (((base) & 0xff0000) >> 16) | (((flags) & 0xf0ff) << 8) | \ + ((limit) & 0xf0000) | ((base) & 0xff000000), \ + } } } + enum { GATE_INTERRUPT = 0xE, GATE_TRAP = 0xF, diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389cd50d..94cd69858b1 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -91,8 +91,9 @@ static inline void lguest_set_ts(void) } /* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } }) -#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } }) +#define FULL_EXEC_SEGMENT \ + ((struct desc_struct)GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)) +#define FULL_SEGMENT ((struct desc_struct)GDT_ENTRY_INIT(0xc093, 0, 0xfffff)) #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index cdc5e0b126a..44efdff3975 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -48,7 +48,7 @@ * head_32 for boot CPU and setup_per_cpu_areas() for others. 
*/ #define GDT_STACK_CANARY_INIT \ - [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, + [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18), /* * Initialize the stackprotector canary value. diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5e841bd60d..febb2dab254 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); static const char driver_version[] = "1.16ac"; /* no spaces */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1961c07af9..8c9bc287f8f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -71,45 +71,45 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { * TLS descriptors are currently at a different place compared to i386. * Hopefully nobody expects them at a fixed place (Wine?) 
*/ - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), #else - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. 
*/ /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), - [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), GDT_STACK_CANARY_INIT #endif } }; diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index 45ad3e9cc36..bd035e3d355 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -60,7 +60,7 @@ do { \ set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) -static struct desc_struct bad_bios_desc; +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); /* * At some point we want to use this stack frame pointer 
to unwind @@ -476,9 +476,6 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) pnp_bios_callpoint.offset = header->fields.pm16offset; pnp_bios_callpoint.segment = PNP_CS16; - bad_bios_desc.a = 0; - bad_bios_desc.b = 0x00409200; - set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); for_each_possible_cpu(i) { -- cgit v1.2.3-70-g09d2 From 72c4d8530244264317a662de9a55cc47e6c8e9df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Aug 2009 08:47:07 +0200 Subject: x86: Introduce GDT_ENTRY_INIT(), fix APM This crash: [ 0.891983] calling cache_sysfs_init+0x0/0x1ee @ 1 [ 0.897251] initcall cache_sysfs_init+0x0/0x1ee returned 0 after 405 usecs [ 0.904019] calling mce_init_device+0x0/0x242 @ 1 [ 0.909124] initcall mce_init_device+0x0/0x242 returned 0 after 347 usecs [ 0.915815] calling apm_init+0x0/0x38d @ 1 [ 0.919967] apm: BIOS version 1.2 Flags 0x07 (Driver version 1.16ac) [ 0.926813] general protection fault: 0000 [#1] [ 0.927269] last sysfs file: [ 0.927269] Modules linked in: [ 0.927269] [ 0.927269] Pid: 271, comm: kapmd Not tainted (2.6.31-rc3-00100-gd520da1-dirty #311) System Product Name [ 0.927269] EIP: 00c0:[<000082b2>] EFLAGS: 00010002 CPU: 0 [ 0.927269] EIP is at 0x82b2 [ 0.927269] EAX: 0000530e EBX: 00000000 ECX: 00000102 EDX: 00000000 [ 0.927269] ESI: 00000000 EDI: f6a4bf44 EBP: 67890000 ESP: f6a4beec [ 0.927269] DS: 00c8 ES: 0000 FS: 0000 GS: 0000 SS: 0068 [ 0.927269] Process kapmd (pid: 271, ti=f6a4a000 task=f7142280 task.ti=f6a4a000) [ 0.927269] Stack: [ 0.927269] 0000828d 02160000 00b88092 f6a4bf3c c102a63d 00000060 f6a4bf3c f6a4bf44 [ 0.927269] <0> 0000007b 0000007b 00000000 00000000 00000000 00000000 560aae9e 00000000 [ 0.927269] <0> 00000200 f705fd74 00000000 c102af70 f6a4bf60 c102a6ec 0000530e 00000000 [ 0.927269] Call Trace: [ 0.927269] [] ? __apm_bios_call_simple+0x7d/0x110 [ 0.927269] [] ? apm+0x0/0x6a0 [ 0.927269] [] ? apm_bios_call_simple+0x1c/0x50 [ 0.927269] [] ? 
apm+0x485/0x6a0 [ 0.927269] [] ? finish_task_switch+0x2a/0xb0 [ 0.927269] [] ? schedule+0x31e/0x480 [ 0.927269] [] ? apm+0x0/0x6a0 [ 0.927269] [] ? apm+0x0/0x6a0 [ 0.927269] [] ? kthread+0x74/0x80 [ 0.927269] [] ? kthread+0x0/0x80 [ 0.927269] [] ? kernel_thread_helper+0x7/0x10 [ 0.927269] Code: Bad EIP value. [ 0.927269] EIP: [<000082b2>] 0x82b2 SS:ESP 0068:f6a4beec [ 0.927269] ---[ end trace a7919e7f17c0a725 ]--- [ 0.927269] Kernel panic - not syncing: Fatal exception [ 0.927269] Pid: 271, comm: kapmd Tainted: G D 2.6.31-rc3-00100-gd520da1-dirty #311 Is caused by an incorrect GDT_ENTRY_INIT() conversion in the apm code, as noticed by hpa. Reported-by: Ingo Molnar Noticed-by: "H. Peter Anvin" Signed-off-by: Akinobu Mita LKML-Reference: <20090808094905.GA2954@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8c9bc287f8f..b5764a26964 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -106,7 +106,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { /* 16-bit code */ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), -- cgit v1.2.3-70-g09d2 From 4c711576b90cc36c13b94816a953a8de6a53d03c Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 6 Aug 2009 15:58:12 -0700 Subject: x86, 32-bit: Use generic sys_pipe() As suggested by Al, it's better to use the generic sys_pipe() for ia32. 
Signed-off-by: WANG Cong Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/ia32/sys_ia32.c | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e590261ba05..ba331bfd111 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -537,7 +537,7 @@ ia32_sys_call_table: .quad sys_mkdir .quad sys_rmdir /* 40 */ .quad sys_dup - .quad sys32_pipe + .quad sys_pipe .quad compat_sys_times .quad quiet_ni_syscall /* old prof syscall holder */ .quad sys_brk /* 45 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 085a8c35f14..9f552719882 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -189,20 +189,6 @@ asmlinkage long sys32_mprotect(unsigned long start, size_t len, return sys_mprotect(start, len, prot); } -asmlinkage long sys32_pipe(int __user *fd) -{ - int retval; - int fds[2]; - - retval = do_pipe_flags(fds, 0); - if (retval) - goto out; - if (copy_to_user(fd, fds, sizeof(fds))) - retval = -EFAULT; -out: - return retval; -} - asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, struct sigaction32 __user *oact, unsigned int sigsetsize) -- cgit v1.2.3-70-g09d2 From 30dd568c912602b7dbd609a45d053e01b13422bb Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 21 Jul 2009 15:56:48 +0200 Subject: x86, perf_counter, bts: Add BTS support to perfcounters Implement a performance counter with: attr.type = PERF_TYPE_HARDWARE attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS attr.sample_period = 1 Using branch trace store (BTS) on x86 hardware, if available. 
The from and to address for each branch can be sampled using: PERF_SAMPLE_IP for the from address PERF_SAMPLE_ADDR for the to address [ v2: address review feedback, fix bugs ] Signed-off-by: Markus Metzger Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 10 ++ arch/x86/kernel/cpu/perf_counter.c | 325 +++++++++++++++++++++++++++++++++++- include/linux/perf_counter.h | 2 + kernel/perf_counter.c | 10 +- 4 files changed, 340 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index fa64e401589..e7b7c938ae2 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,16 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed counter range, since lower + * values are used by actual fixed counters and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 
+ */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(void); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a7aa8f90095..b237c181aa4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -6,6 +6,7 @@ * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, * * For licencing details see kernel-base/COPYING */ @@ -20,6 +21,7 @@ #include #include #include +#include #include #include @@ -27,12 +29,52 @@ static u64 perf_counter_mask __read_mostly; +/* The maximal number of PEBS counters: */ +#define MAX_PEBS_COUNTERS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. 
+ */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; +}; + struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; int enabled; + struct debug_store *ds; }; /* @@ -57,6 +99,8 @@ struct x86_pmu { u64 counter_mask; u64 max_period; u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); }; static struct x86_pmu x86_pmu __read_mostly; @@ -576,6 +620,9 @@ x86_perf_counter_update(struct perf_counter *counter, u64 prev_raw_count, new_raw_count; s64 delta; + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * Careful: an NMI might modify the previous counter value. * @@ -659,10 +706,109 @@ static void release_pmc_hardware(void) enable_lapic_nmi_watchdog(); } +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(long)ds), (u32)((u64)(long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_counters, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_counters, cpu).ds = NULL; + + kfree((void *)(long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int 
reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return -EOPNOTSUPP; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_counters, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + static void hw_perf_counter_destroy(struct perf_counter *counter) { if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); + release_bts_hardware(); mutex_unlock(&pmc_reserve_mutex); } } @@ -705,6 +851,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) return 0; } +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + 
/* * Setup the hardware configuration for a given attr_type */ @@ -721,9 +903,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter) err = 0; if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) - err = -EBUSY; - else + if (atomic_read(&active_counters) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_bts_hardware(); + } + if (!err) atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } @@ -801,7 +987,18 @@ static void p6_pmu_disable_all(void) static void intel_pmu_disable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); } static void amd_pmu_disable_all(void) @@ -859,7 +1056,25 @@ static void p6_pmu_enable_all(void) static void intel_pmu_enable_all(void) { + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_counter *counter = + cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!counter)) + return; + + intel_pmu_enable_bts(counter->hw.config); + } } static void amd_pmu_enable_all(void) @@ -946,6 +1161,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) static inline void intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc, idx); return; @@ -974,6 +1194,9 @@ x86_perf_counter_set_period(struct perf_counter *counter, s64 period = hwc->sample_period; int err, ret = 0; + if 
(idx == X86_PMC_IDX_FIXED_BTS) + return 0; + /* * If we are way outside a reasoable range then just skip forward: */ @@ -1056,6 +1279,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_counters).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc, idx); return; @@ -1077,11 +1308,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + if (!x86_pmu.num_counters_fixed) return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) @@ -1102,7 +1338,25 @@ static int x86_pmu_enable(struct perf_counter *counter) int idx; idx = fixed_mode_idx(counter, hwc); - if (idx >= 0) { + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* + * Try to use BTS for branch tracing. If that is not + * available, try to get a generic counter. 
+ */ + if (unlikely(!cpuc->ds)) + goto try_generic; + + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = 0; + hwc->counter_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { /* * Try to get the fixed counter, if that is already taken * then try to get a generic counter: @@ -1213,6 +1467,45 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, + struct perf_sample_data *data) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; + unsigned long orig_ip = data->regs->ip; + u64 at; + + if (!counter) + return; + + if (!ds) + return; + + for (at = ds->bts_buffer_base; + at < ds->bts_index; + at += sizeof(struct bts_record)) { + struct bts_record *rec = (struct bts_record *)(long)at; + + data->regs->ip = rec->from; + data->addr = rec->to; + + perf_counter_output(counter, 1, data); + } + + ds->bts_index = ds->bts_buffer_base; + + data->regs->ip = orig_ip; + data->addr = 0; + + /* There's new data available. */ + counter->pending_kill = POLL_IN; +} + static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1237,6 +1530,15 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + + /* Drain the remaining BTS records. 
*/ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct perf_sample_data data; + struct pt_regs regs; + + data.regs = &regs; + intel_pmu_drain_bts_buffer(cpuc, &data); + } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); @@ -1264,6 +1566,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter) static void intel_pmu_reset(void) { + struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; unsigned long flags; int idx; @@ -1281,6 +1584,8 @@ static void intel_pmu_reset(void) for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); } + if (ds) + ds->bts_index = ds->bts_buffer_base; local_irq_restore(flags); } @@ -1346,6 +1651,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); + intel_pmu_drain_bts_buffer(cpuc, &data); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1547,6 +1853,8 @@ static struct x86_pmu intel_pmu = { * the generic counter period: */ .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, }; static struct x86_pmu amd_pmu = { @@ -1936,3 +2244,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } + +void hw_perf_counter_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d0..0a6f3209c9d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -692,6 +692,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d6294..bf8110b35c5 100644 --- a/kernel/perf_counter.c +++ 
b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4566,6 +4567,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4590,6 +4596,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v1.2.3-70-g09d2 From 9f51e24ee8b5a1595b6a5ac0c2be278a16488e75 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 9 Aug 2009 21:54:00 +0200 Subject: x86: Use printk_once() Signed-off-by: Marcin Slusarz Cc: "H. 
Peter Anvin" LKML-Reference: <1249847649-11631-6-git-send-email-marcin.slusarz@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 5 ++--- arch/x86/kvm/x86.c | 7 +------ 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 3b09634a515..7d35d0fe232 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -218,7 +218,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) void fixup_irqs(void) { unsigned int irq; - static int warned; struct irq_desc *desc; for_each_irq_desc(irq, desc) { @@ -236,8 +235,8 @@ void fixup_irqs(void) } if (desc->chip->set_affinity) desc->chip->set_affinity(irq, affinity); - else if (desc->action && !(warned++)) - printk("Cannot set affinity for irq %i\n", irq); + else if (desc->action) + printk_once("Cannot set affinity for irq %i\n", irq); } #if 0 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe5474aec41..0572c90f0c8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2261,12 +2261,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu) { - static int reported; - - if (!reported) { - reported = 1; - printk(KERN_WARNING "kvm: emulating exchange as write\n"); - } + printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); #ifndef CONFIG_X86_64 /* guests cmpxchg8b have to be emulated atomically */ if (bytes == 8) { -- cgit v1.2.3-70-g09d2 From a8ad568dd8ca122aa8048ea067d3599820d1c1b4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Aug 2009 11:53:10 +0900 Subject: dma-ops: Remove flush_write_buffers() in dma-mapping-common.h This moves flush_write_buffers() in asm-generic/dma-mapping-common.h to arch/x86/kernel/pci-nommu.c. The purpose of this patch is that, we can avoid defining NULL flush_write_buffers() on IA64 and SPARC. 
dma-mapping-common.h is used by X86 and IA64 (and SPARC soon) but only X86 with CONFIG_X86_OOSTORE or CONFIG_X86_PPRO_FENCE actually uses flush_write_buffers(). CONFIG_X86_OOSTORE or CONFIG_X86_PPRO_FENCE is usable with only kernel/pci-nommu.c (that is, not usable with other X86 IOMMU implementations such as SWIOTLB, VT-d, etc) so we can safely move flush_write_buffers() in asm-generic/dma-mapping-common.h to arch/x86/kernel/pci-nommu.c. The further discussion is: http://lkml.org/lkml/2009/6/28/104 Signed-off-by: Arnd Bergmann Acked-by: FUJITA Tomonori Cc: davem@davemloft.net Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com LKML-Reference: <1249872797-1314-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 27 ++++++++++++++++++++++----- include/asm-generic/dma-mapping-common.h | 6 ------ 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index c0a4222bf62..a3933d4330c 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -79,12 +79,29 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, free_pages((unsigned long)vaddr, get_order(size)); } +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + struct dma_map_ops nommu_dma_ops = { - .alloc_coherent = dma_generic_alloc_coherent, - .free_coherent = nommu_free_coherent, - .map_sg = nommu_map_sg, - .map_page = nommu_map_page, - .is_phys = 1, + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = nommu_free_coherent, + .map_sg = nommu_map_sg, + .map_page = nommu_map_page, + .sync_single_for_device = nommu_sync_single_for_device, + 
.sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, }; void __init no_iommu_init(void) diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h index 5406a601185..e694263445f 100644 --- a/include/asm-generic/dma-mapping-common.h +++ b/include/asm-generic/dma-mapping-common.h @@ -103,7 +103,6 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); - flush_write_buffers(); } static inline void dma_sync_single_for_device(struct device *dev, @@ -116,7 +115,6 @@ static inline void dma_sync_single_for_device(struct device *dev, if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); - flush_write_buffers(); } static inline void dma_sync_single_range_for_cpu(struct device *dev, @@ -132,7 +130,6 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev, ops->sync_single_range_for_cpu(dev, addr, offset, size, dir); debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); - flush_write_buffers(); } else dma_sync_single_for_cpu(dev, addr, size, dir); } @@ -150,7 +147,6 @@ static inline void dma_sync_single_range_for_device(struct device *dev, ops->sync_single_range_for_device(dev, addr, offset, size, dir); debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); - flush_write_buffers(); } else dma_sync_single_for_device(dev, addr, size, dir); } @@ -165,7 +161,6 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); - flush_write_buffers(); } static inline void @@ -179,7 +174,6 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, ops->sync_sg_for_device(dev, sg, nelems, dir); 
debug_dma_sync_sg_for_device(dev, sg, nelems, dir); - flush_write_buffers(); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, NULL) -- cgit v1.2.3-70-g09d2 From c7425314c755d5f94da7c978205c85a7c6201212 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 9 Aug 2009 17:03:52 +0900 Subject: x86: Introduce GDT_ENTRY_INIT(), initialize bad_bios_desc statically Fully initialize bad_bios_desc statically instead of doing some fields statically and some dynamically. Suggested-by: "H. Peter Anvin" Signed-off-by: Akinobu Mita LKML-Reference: <20090809080350.GA4765@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 19 +++++++++---------- drivers/pnp/pnpbios/bioscalls.c | 5 ++--- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index febb2dab254..39a4462ef8a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -403,7 +403,15 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); static const char driver_version[] = "1.16ac"; /* no spaces */ @@ -2331,15 +2339,6 @@ static int __init apm_init(void) } pm_flags |= PM_APM; - /* - * Set up a segment that references the real mode segment 0x40 - * that extends up to the end of page zero (that we have reserved). 
- * This is for buggy BIOS's that refer to (real mode) segment 0x40 - * even though they are called in protected mode. - */ - set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); - set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); - /* * Set up the long jump entry point to the APM BIOS, which is called * from inline assembly. diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index bd035e3d355..fc83783c3a9 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -60,7 +60,8 @@ do { \ set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) -static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, 0, 0); +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); /* * At some point we want to use this stack frame pointer to unwind @@ -476,8 +477,6 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) pnp_bios_callpoint.offset = header->fields.pm16offset; pnp_bios_callpoint.segment = PNP_CS16; - set_desc_base(&bad_bios_desc, (unsigned long)__va(0x40UL << 4)); - set_desc_limit(&bad_bios_desc, 4095 - (0x40 << 4)); for_each_possible_cpu(i) { struct desc_struct *gdt = get_cpu_gdt_table(i); if (!gdt) -- cgit v1.2.3-70-g09d2 From 5b7e88edc6193f36941bccbfd5ed9ed5fe27d2e1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:40 +0800 Subject: x86, mce: Support specifying context for software mce injection The cpu context is specified via the new mce.inject_flags fields. This allows more realistic machine check testing in different situations. "RANDOM" context is implemented via NMI broadcasting to add randomization to testing. AK: Fix NMI broadcasting check. Fix 32-bit building. Some race fixes. Move to module. 
Various changes ChangeLog: v3: - Re-based on latest x86-tip.git/mce4 - Fix 32-bit building v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 11 ++- arch/x86/kernel/cpu/mcheck/mce-inject.c | 156 ++++++++++++++++++++++++++------ 2 files changed, 135 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ad753537291..8945be9ad2b 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,13 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +#define MCJ_CTX_MASK 3 +#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) +#define MCJ_CTX_RANDOM 0 /* inject context: random */ +#define MCJ_CTX_PROCESS 1 /* inject context: process */ +#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ + /* Fields are zero when not available */ struct mce { __u64 status; @@ -48,8 +55,8 @@ struct mce { __u64 tsc; /* cpu time stamp counter */ __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 pad1; - __u16 pad2; + __u8 inject_flags; /* software inject flags */ + __u16 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f0..ad5d92790eb 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include #include #include +#include +#include +#include +#include #include +#include /* Update fake mce registers on current CPU. 
*/ static void inject_mce(struct mce *m) @@ -39,44 +44,141 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; +static void raise_corrected(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(&regs, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = &regs; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->status & MCI_STATUS_UC) + raise_uncorrected(m, args->regs); + else if (m->status) + raise_corrected(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, }; /* Inject mce on current CPU */ -static void raise_mce(unsigned long data) +static int raise_local(struct mce *m) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; + int context = MCJ_CTX(m->inject_flags); + int ret = 0; int cpu = m->extcpu; - inject_mce(m); if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; - memset(&regs, 0, sizeof(struct pt_regs)); - regs.ip = m->ip; - regs.cs = m->cs; printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(&regs, 0); + switch (context) { + case
MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_uncorrected(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); + } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); + raise_corrected(m); mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); - } - kfree(dm); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(m); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(m); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +198,12 @@ static ssize_t 
mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +211,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } -- cgit v1.2.3-70-g09d2 From 0dcc66851f1091af421416c28a9458836885f522 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:41 +0800 Subject: x86, mce: Support specifying raise mode for software MCE injection Raise mode include raising as exception or raising as poll, it is specified via the mce.inject_flags field. This can be used to specify raise mode of UCNA, which is UC error but raised not as exception. And this can be used to test the filter code of poll handler or exception handler too. For example, enforce a poll raise mode for a fatal MCE. ChangeLog: v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8945be9ad2b..b608a64c581 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -44,6 +44,7 @@ #define MCJ_CTX_PROCESS 1 /* inject context: process */ #define MCJ_CTX_IRQ 2 /* inject context: IRQ */ #define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 8 /* raise as exception */ /* Fields are zero when not available */ struct mce { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index ad5d92790eb..7029f0e2aca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -44,7 +44,7 @@ static void inject_mce(struct mce *m) i->finished = 1; } -static void raise_corrected(struct mce *m) +static void raise_poll(struct mce *m) { unsigned long flags; mce_banks_t b; @@ -56,7 +56,7 @@ static void raise_corrected(struct mce *m) m->finished = 0; } -static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +static void raise_exception(struct mce *m, struct pt_regs *pregs) { struct pt_regs regs; unsigned long flags; @@ -85,10 +85,10 @@ static int mce_raise_notify(struct notifier_block *self, if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) return NOTIFY_DONE; cpu_clear(cpu, mce_inject_cpumask); - if (m->status & MCI_STATUS_UC) - raise_uncorrected(m, args->regs); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); else if (m->status) - raise_corrected(m); + raise_poll(m); return NOTIFY_STOP; } @@ -104,7 +104,7 @@ static int raise_local(struct mce *m) int ret = 0; int cpu = m->extcpu; - if (m->status & MCI_STATUS_UC) { + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); switch (context) { case 
MCJ_CTX_IRQ: @@ -115,7 +115,7 @@ static int raise_local(struct mce *m) */ /*FALL THROUGH*/ case MCJ_CTX_PROCESS: - raise_uncorrected(m, NULL); + raise_exception(m, NULL); break; default: printk(KERN_INFO "Invalid MCE context\n"); @@ -124,7 +124,7 @@ static int raise_local(struct mce *m) printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - raise_corrected(m); + raise_poll(m); mce_notify_irq(); printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); } else -- cgit v1.2.3-70-g09d2 From 5be9ed251f58881dfc3dd6742a81ff9ad1a7bb04 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:42 +0800 Subject: x86, mce: Move debugfs mce dir creating to mce.c Because more debugfs files under mce dir will be create in mce.c. ChangeLog: v5: - Rebased on x86-tip.git/mce Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 1 + arch/x86/kernel/cpu/mcheck/mce-severity.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 6bd51e7ba87..32996f9fab6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -22,6 +22,7 @@ struct mce_bank { }; int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 51f7c725dab..bc35a073d15 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -197,7 +197,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce 
== NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1ce6db1f878..9c7419e459d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2003,3 +2004,15 @@ static int __init mcheck_disable(char *str) return 1; } __setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} +#endif -- cgit v1.2.3-70-g09d2 From bf783f9f7d33576815bc89f9f1856a7309ea2f17 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:43 +0800 Subject: x86, mce: Fake panic support for MCE testing If "fake panic" mode is turned on, just log panic message instead of go real panic. This is used for testing only, so that the test suite can check for the correct panic message and do regression testing for MCE would go panic. This patch is based on x86-tip.git/mce. ChangeLog: v5: - Rebased on x86-tip.git/mce v4: - Move config file from sysfs to debugfs Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 75 ++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9c7419e459d..54bd1b2fb4c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -204,6 +204,9 @@ static void print_mce_tail(void) static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress. 
Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { @@ -221,15 +224,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_inc_return(&mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -256,9 +265,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -2015,4 +2027,45 @@ struct dentry *mce_get_debugfs_dir(void) return dmce; } + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); + +static int __init mce_debugfs_init(void) +{ + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + 
&fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; +} +late_initcall(mce_debugfs_init); #endif -- cgit v1.2.3-70-g09d2 From eeac19a7efa150231e4a6bb110d6f27500bcc8ce Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:13 -0400 Subject: tracing: Map syscall name to number Add a new function to support translating a syscall name to number at runtime. This allows the syscall event tracer to map syscall names to number. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ftrace.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8e9663413b7..afb31d72618 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -500,6 +500,22 @@ struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } +int syscall_name_to_nr(char *name) +{ + int i; + + if (!syscalls_metadata) + return -1; + + for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + if (syscalls_metadata[i]) { + if (!strcmp(syscalls_metadata[i]->name, name)) + return i; + } + } + return -1; +} + void arch_init_ftrace_syscalls(void) { int i; -- cgit v1.2.3-70-g09d2 From 066e0378c23f0a3db730893f6a041e4a3922a385 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:23 -0400 Subject: tracing: Call arch_init_ftrace_syscalls at boot Call arch_init_ftrace_syscalls at boot, so we can determine early the set of syscalls for the syscall trace events. 
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ftrace.c | 15 ++++----------- include/trace/syscall.h | 1 - kernel/trace/trace_syscalls.c | 1 - 3 files changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index afb31d72618..0d93d409b8d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -516,31 +516,24 @@ int syscall_name_to_nr(char *name) return -1; } -void arch_init_ftrace_syscalls(void) +static int __init arch_init_ftrace_syscalls(void) { int i; struct syscall_metadata *meta; unsigned long **psys_syscall_table = &sys_call_table; - static atomic_t refs; - - if (atomic_inc_return(&refs) != 1) - goto end; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * FTRACE_SYSCALL_MAX, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return; + return -ENOMEM; } for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } - return; - - /* Paranoid: avoid overflow */ -end: - atomic_dec(&refs); + return 0; } +arch_initcall(arch_init_ftrace_syscalls); #endif diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 8cfe515cbc4..c55fcce4fbb 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -19,7 +19,6 @@ struct syscall_metadata { }; #ifdef CONFIG_FTRACE_SYSCALLS -extern void arch_init_ftrace_syscalls(void); extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern void start_ftrace_syscalls(void); extern void stop_ftrace_syscalls(void); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e579645ac8..08aed439fea 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,7 +106,6 @@ void start_ftrace_syscalls(void) if (++refcount != 1) goto 
unlock; - arch_init_ftrace_syscalls(); read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { -- cgit v1.2.3-70-g09d2 From a871bd33a6c0bc86fb47cd02ea2650dd43d3d95f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:31 -0400 Subject: tracing: Add syscall tracepoints add two tracepoints in syscall exit and entry path, conditioned on TIF_SYSCALL_FTRACE. Supports the syscall trace event code. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ptrace.c | 7 +++++-- include/trace/syscall.h | 20 ++++++++++++++++++++ kernel/tracepoint.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 09ecbde91c1..34dd6f15185 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -37,6 +37,9 @@ #include +DEFINE_TRACE(syscall_enter); +DEFINE_TRACE(syscall_exit); + #include "tls.h" enum x86_regset { @@ -1498,7 +1501,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) ret = -1L; if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_enter(regs); + trace_syscall_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1524,7 +1527,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) - ftrace_syscall_exit(regs); + trace_syscall_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/include/trace/syscall.h b/include/trace/syscall.h index c55fcce4fbb..3951d774de1 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -1,8 +1,28 @@ #ifndef _TRACE_SYSCALL_H #define _TRACE_SYSCALL_H +#include + 
#include + +extern void syscall_regfunc(void); +extern void syscall_unregfunc(void); + +DECLARE_TRACE_WITH_CALLBACK(syscall_enter, + TP_PROTO(struct pt_regs *regs, long id), + TP_ARGS(regs, id), + syscall_regfunc, + syscall_unregfunc +); + +DECLARE_TRACE_WITH_CALLBACK(syscall_exit, + TP_PROTO(struct pt_regs *regs, long ret), + TP_ARGS(regs, ret), + syscall_regfunc, + syscall_unregfunc +); + /* * A syscall entry in the ftrace syscalls array. * diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 1ef5d3a601c..070a42bb892 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -24,6 +24,7 @@ #include #include #include +#include extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -577,3 +578,40 @@ static int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ + +static DEFINE_MUTEX(regfunc_mutex); +static int sys_tracepoint_refcount; + +void syscall_regfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + mutex_lock(&regfunc_mutex); + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } + sys_tracepoint_refcount++; + mutex_unlock(&regfunc_mutex); +} + +void syscall_unregfunc(void) +{ + unsigned long flags; + struct task_struct *g, *t; + + mutex_lock(&regfunc_mutex); + sys_tracepoint_refcount--; + if (!sys_tracepoint_refcount) { + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + } while_each_thread(g, t); + read_unlock_irqrestore(&tasklist_lock, flags); + } + mutex_unlock(&regfunc_mutex); +} -- cgit v1.2.3-70-g09d2 From 9daa77e2e9a6b8b859660d5e24d0f8cd77c2af39 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:35 -0400 Subject: tracing: Update FTRACE_SYSCALL_MAX update FTRACE_SYSCALL_MAX to the current number of
syscalls FTRACE_SYSCALL_MAX is a temporary solution to get the number of syscalls supported by the arch until we find a more dynamic way to get this number. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index bd2c6511c88..71136545187 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -30,9 +30,9 @@ /* FIXME: I don't want to stay hardcoded */ #ifdef CONFIG_X86_64 -# define FTRACE_SYSCALL_MAX 296 +# define FTRACE_SYSCALL_MAX 299 #else -# define FTRACE_SYSCALL_MAX 333 +# define FTRACE_SYSCALL_MAX 337 #endif #ifdef CONFIG_FUNCTION_TRACER -- cgit v1.2.3-70-g09d2 From 64c12e0444fcc6b75eb49144ba46d43dbdc6bc8f Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:52:53 -0400 Subject: tracing: Add individual syscalls tracepoint id support The current state of syscalls tracepoints generates only one event id for every syscall events. This patch associates an id with each syscall trace event, so that we can identify each syscall trace event using the 'perf' tool. 
Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ftrace.c | 10 ++++++++++ include/linux/syscalls.h | 22 ++++++++++++++++++---- include/trace/syscall.h | 8 ++++++++ kernel/trace/trace.h | 6 ------ kernel/trace/trace_syscalls.c | 26 ++++++++++++++++---------- 5 files changed, 52 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0d93d409b8d..3cff1214e17 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -516,6 +516,16 @@ int syscall_name_to_nr(char *name) return -1; } +void set_syscall_enter_id(int num, int id) +{ + syscalls_metadata[num]->enter_id = id; +} + +void set_syscall_exit_id(int num, int id) +{ + syscalls_metadata[num]->exit_id = id; +} + static int __init arch_init_ftrace_syscalls(void) { int i; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5e5b4d33a31..ce4b01c658e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -116,13 +116,20 @@ struct perf_counter_attr; #define SYSCALL_TRACE_ENTER_EVENT(sname) \ static struct ftrace_event_call event_enter_##sname; \ + struct trace_event enter_syscall_print_##sname = { \ + .trace = print_syscall_enter, \ + }; \ static int init_enter_##sname(void) \ { \ - int num; \ + int num, id; \ num = syscall_name_to_nr("sys"#sname); \ if (num < 0) \ return -ENOSYS; \ - register_ftrace_event(&event_syscall_enter); \ + id = register_ftrace_event(&enter_syscall_print_##sname);\ + if (!id) \ + return -ENODEV; \ + event_enter_##sname.id = id; \ + set_syscall_enter_id(num, id); \ INIT_LIST_HEAD(&event_enter_##sname.fields); \ init_preds(&event_enter_##sname); \ return 0; \ @@ -142,13 +149,20 @@ struct perf_counter_attr; #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static struct ftrace_event_call event_exit_##sname; \ + 
struct trace_event exit_syscall_print_##sname = { \ + .trace = print_syscall_exit, \ + }; \ static int init_exit_##sname(void) \ { \ - int num; \ + int num, id; \ num = syscall_name_to_nr("sys"#sname); \ if (num < 0) \ return -ENOSYS; \ - register_ftrace_event(&event_syscall_exit); \ + id = register_ftrace_event(&exit_syscall_print_##sname);\ + if (!id) \ + return -ENODEV; \ + event_exit_##sname.id = id; \ + set_syscall_exit_id(num, id); \ INIT_LIST_HEAD(&event_exit_##sname.fields); \ init_preds(&event_exit_##sname); \ return 0; \ diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 73fb8b4a995..df628404241 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -32,23 +32,31 @@ DECLARE_TRACE_WITH_CALLBACK(syscall_exit, * @nb_args: number of parameters it takes * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) + * @enter_id: associated ftrace enter event id + * @exit_id: associated ftrace exit event id */ struct syscall_metadata { const char *name; int nb_args; const char **types; const char **args; + int enter_id; + int exit_id; }; #ifdef CONFIG_FTRACE_SYSCALLS extern struct syscall_metadata *syscall_nr_to_meta(int nr); extern int syscall_name_to_nr(char *name); +void set_syscall_enter_id(int num, int id); +void set_syscall_exit_id(int num, int id); extern struct trace_event event_syscall_enter; extern struct trace_event event_syscall_exit; extern int reg_event_syscall_enter(void *ptr); extern void unreg_event_syscall_enter(void *ptr); extern int reg_event_syscall_exit(void *ptr); extern void unreg_event_syscall_exit(void *ptr); +enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags); +enum print_line_t print_syscall_exit(struct trace_iterator *iter, int flags); #endif #endif /* _TRACE_SYSCALL_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d682357e4b1..300ef788c97 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,8 +34,6 @@ enum 
trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_HW_BRANCHES, - TRACE_SYSCALL_ENTER, - TRACE_SYSCALL_EXIT, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, TRACE_POWER, @@ -319,10 +317,6 @@ extern void __ftrace_bad_type(void); TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct syscall_trace_enter, \ - TRACE_SYSCALL_ENTER); \ - IF_ASSIGN(var, ent, struct syscall_trace_exit, \ - TRACE_SYSCALL_EXIT); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c7ae25ee95d..e58a9c11ba8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -36,14 +36,18 @@ print_syscall_enter(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int i, ret, syscall; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) goto end; + if (entry->enter_id != ent->type) { + WARN_ON_ONCE(1); + goto end; + } + ret = trace_seq_printf(s, "%s(", entry->name); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -78,16 +82,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags) struct syscall_metadata *entry; int ret; - trace_assign_type(trace, ent); - + trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + if (!entry) { trace_seq_printf(s, "\n"); return TRACE_TYPE_HANDLED; } + if (entry->exit_id != ent->type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); if (!ret) @@ -114,7 +122,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, + event = trace_current_buffer_lock_reserve(sys_data->enter_id, size, 0, 0); if (!event) return; @@ -142,7 +150,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, 
long ret) if (!sys_data) return; - event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, + event = trace_current_buffer_lock_reserve(sys_data->exit_id, sizeof(*entry), 0, 0); if (!event) return; @@ -239,10 +247,8 @@ void unreg_event_syscall_exit(void *ptr) struct trace_event event_syscall_enter = { .trace = print_syscall_enter, - .type = TRACE_SYSCALL_ENTER }; struct trace_event event_syscall_exit = { .trace = print_syscall_exit, - .type = TRACE_SYSCALL_EXIT }; -- cgit v1.2.3-70-g09d2 From 0ac676fb50f5f8a22e5e80afc40bf38e31b77c00 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 10 Aug 2009 16:53:11 -0400 Subject: tracing: Convert x86_64 mmap and uname to use DEFINE_SYSCALL A number of syscalls are not using 'DEFINE_SYSCALL'. I'm not sure why. Convert x86_64 uname and mmap to use DEFINE_SYSCALL. Signed-off-by: Jason Baron Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Masami Hiramatsu Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/sys_x86_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 6bc211accf0..45e00eb09c3 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,9 +18,9 @@ #include #include -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, off) { long error; struct file *file; @@ -226,7 +226,7 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user *name) +SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) { int err; down_read(&uts_sem); -- cgit v1.2.3-70-g09d2 From 81e2d7b30d718824434725a4a24d5864a71b1d30 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 12 Aug 2009 05:45:34 -0700 Subject: x86, intel_txt: tboot.c needs <asm/fixmap.h> arch/x86/kernel/tboot.c needs <asm/fixmap.h>. In most configurations that ends up getting implicitly included, but not in all, causing build failures in some configurations. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Joseph Cihula Cc: Shane Wang --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 263591afd29..1ab80120894 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -33,6 +33,7 @@ #include #include #include +#include <asm/fixmap.h> #include #include #include -- cgit v1.2.3-70-g09d2 From 00ae4064b1445524752575dd84df227c0687c99d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: rename 4k first chunk allocator to page Page size isn't always 4k depending on arch and configuration. Rename 4k first chunk allocator to page. Signed-off-by: Tejun Heo Cc: David Howells --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/setup_percpu.c | 23 ++++++++++++----------- include/linux/percpu.h | 2 +- mm/percpu.c | 25 ++++++++++++++----------- 4 files changed, 28 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6..12e9eb77ee0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,7 +1920,7 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "4k". + Allowed values are one of "lpage", "embed" and "page". See comments in arch/x86/kernel/setup_percpu.c for details on each allocator. This parameter is primarily for debugging and performance comparison. 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a26ff61e2fb..1e17711c29d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -249,21 +249,22 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k allocator + * Page allocator * - * Boring fallback 4k allocator. This allocator puts more pressure on - * PTE TLBs but other than that behaves nicely on both UMA and NUMA. + * Boring fallback 4k page allocator. This allocator puts more + * pressure on PTE TLBs but other than that behaves nicely on both UMA + * and NUMA. */ -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) +static ssize_t __init setup_pcpu_page(size_t static_size) { - return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpu4k_populate_pte); + return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); } /* for explicit first chunk allocator selection */ @@ -307,7 +308,7 @@ void __init setup_per_cpu_areas(void) */ ret = -EINVAL; if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { + if (strcmp(pcpu_chosen_alloc, "page")) { if (!strcmp(pcpu_chosen_alloc, "lpage")) ret = setup_pcpu_lpage(static_size, true); else if (!strcmp(pcpu_chosen_alloc, "embed")) @@ -317,7 +318,7 @@ void __init setup_per_cpu_areas(void) "specified\n", pcpu_chosen_alloc); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", + "falling back to page size\n", pcpu_chosen_alloc, ret); } } else { @@ -326,7 +327,7 @@ void __init setup_per_cpu_areas(void) ret = setup_pcpu_embed(static_size, false); } if (ret < 0) - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_page(static_size); if (ret < 0) panic("cannot allocate static 
percpu area (%zu bytes, err=%zd)", static_size, ret); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e134c822963..7989f61b03f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -74,7 +74,7 @@ extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); -extern ssize_t __init pcpu_4k_first_chunk( +extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, diff --git a/mm/percpu.c b/mm/percpu.c index cbddcbdab68..6feac793490 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1497,15 +1497,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /** - * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * - * This is a helper to ease setting up embedded first percpu chunk and - * can be called where pcpu_setup_first_chunk() is expected. + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. @@ -1514,12 +1514,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. 
*/ -ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; @@ -1527,6 +1528,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, int i, j; ssize_t ret; + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, PCPU_MIN_UNIT_SIZE)); @@ -1542,8 +1545,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); goto enomem; } pages[j++] = virt_to_page(ptr); @@ -1580,8 +1583,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d 4k pages/cpu @%p s%zu r%zu\n", - unit_pages, vm.addr, static_size, reserved_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n", + unit_pages, psize_str, vm.addr, static_size, reserved_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, unit_pages << PAGE_SHIFT, vm.addr, NULL); -- cgit v1.2.3-70-g09d2 From 08fc45806103e59a37418e84719b878f9bb32540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: percpu: build first chunk allocators selectively There's no need to build unused first chunk allocators in. Define CONFIG_NEED_PER_CPU_*_FIRST_CHUNK and let archs enable them selectively. 
Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 10 ++++++++++ include/linux/percpu.h | 27 +++++---------------------- mm/percpu.c | 19 +++++++++++-------- 3 files changed, 26 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e06b2eeff9f..f7ac2721551 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,6 +150,16 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_LPAGE_FIRST_CHUNK + def_bool y + depends on NEED_MULTIPLE_NODES + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 7989f61b03f..e26788e0da4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -70,17 +70,21 @@ extern size_t __init pcpu_setup_first_chunk( ssize_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#endif -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( size_t static_size, size_t reserved_size, ssize_t *dyn_sizep, size_t *unit_sizep, @@ -98,27 +102,6 @@ extern ssize_t __init pcpu_lpage_first_chunk( extern void *pcpu_lpage_remapped(void *kaddr); #else -static inline int pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - return -EINVAL; -} - -static inline ssize_t __init 
pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - return -EINVAL; -} - static inline void *pcpu_lpage_remapped(void *kaddr) { return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index 6feac793490..7971997de31 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,8 +1414,9 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } -static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep) +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) { size_t size_sum; @@ -1427,6 +1428,8 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1495,7 +1498,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, base, NULL); } +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes @@ -1598,12 +1604,9 @@ out_free_ar: free_bootmem(__pa(pages), pages_size); return ret; } +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -/* - * Large page remapping first chunk setup helper - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES - +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping * @static_size: the size of 
static percpu area in bytes @@ -1982,7 +1985,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } -#endif +#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ /* * Generic percpu area setup. -- cgit v1.2.3-70-g09d2 From f58dc01ba2ca9fe3ab2ba4ca43d9c8a735cf62d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: generalize first chunk allocator selection Now that all first chunk allocators are in mm/percpu.c, it makes sense to generalize the percpu_alloc kernel parameter. Define PCPU_FC_* and set pcpu_chosen_fc using early_param() in mm/percpu.c. Arch code can use the set value to determine which first chunk allocator to use. Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 11 ++++++----- arch/x86/kernel/setup_percpu.c | 24 ++++++------------------ include/linux/percpu.h | 12 ++++++++++++ mm/percpu.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 12e9eb77ee0..dee9ce2e6cf 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "page". - See comments in arch/x86/kernel/setup_percpu.c for - details on each allocator. This parameter is primarily - for debugging and performance comparison. + percpu_alloc= Select which percpu first chunk allocator to use. + Currently supported values are "embed", "page" and + "lpage". Archs may support subset or none of the + selections. See comments in mm/percpu.c for details + on each allocator. This parameter is primarily for + debugging and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. 
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1e17711c29d..b961d99e641 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -267,16 +267,6 @@ static ssize_t __init setup_pcpu_page(size_t static_size) pcpup_populate_pte); } -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -307,19 +297,17 @@ void __init setup_per_cpu_areas(void) * each allocator for details. */ ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "page")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) + if (pcpu_chosen_fc != PCPU_FC_AUTO) { + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + if (pcpu_chosen_fc == PCPU_FC_LPAGE) ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); + ret = setup_pcpu_embed(static_size, true); + if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " "falling back to page size\n", - pcpu_chosen_alloc, ret); + pcpu_fc_names[pcpu_chosen_fc], ret); } } else { ret = setup_pcpu_lpage(static_size, false); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e26788e0da4..9be05cbe5ee 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,18 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +enum pcpu_fc { + PCPU_FC_AUTO, + PCPU_FC_EMBED, + PCPU_FC_PAGE, + PCPU_FC_LPAGE, + + PCPU_FC_NR, +}; +extern const char *pcpu_fc_names[PCPU_FC_NR]; + +extern enum pcpu_fc pcpu_chosen_fc; + typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void 
(*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); diff --git a/mm/percpu.c b/mm/percpu.c index 7971997de31..7fb40bb1555 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,6 +1414,38 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", + [PCPU_FC_LPAGE] = "lpage", +}; + +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; + +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK + else if (!strcmp(str, "lpage")) + pcpu_chosen_fc = PCPU_FC_LPAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); + + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) -- cgit v1.2.3-70-g09d2 From 9a7737691e90d3cce0e5248f91826c50e5aa3fcf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: drop @static_size from first chunk allocators First chunk allocators assume percpu areas have been linked using one of PERCPU_*() macros and depend on __per_cpu_load symbol defined by those macros, so there isn't much point in passing in static area size explicitly when it can be easily calculated from __per_cpu_start and __per_cpu_end. Drop @static_size from all percpu first chunk allocators and helpers. 
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 34 +++++++++++++++------------------- include/linux/percpu.h | 18 ++++++++---------- mm/percpu.c | 29 +++++++++++++---------------- 3 files changed, 36 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b961d99e641..8aad486c688 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,7 +157,7 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; @@ -184,8 +184,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -ENOMEM; } - ret = pcpu_lpage_build_unit_map(static_size, - PERCPU_FIRST_CHUNK_RESERVE, + ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, &dyn_size, &unit_size, PMD_SIZE, unit_map, pcpu_lpage_cpu_distance); if (ret < 0) { @@ -208,9 +207,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) } } - ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - dyn_size, unit_size, PMD_SIZE, - unit_map, nr_units, + ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + unit_size, PMD_SIZE, unit_map, nr_units, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: if (ret < 0) @@ -218,7 +216,7 @@ out_free: return ret; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -232,7 +230,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. 
*/ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -244,7 +242,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, reserve - PERCPU_FIRST_CHUNK_RESERVE); } @@ -260,9 +258,9 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(size_t static_size) +static ssize_t __init setup_pcpu_page(void) { - return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); } @@ -282,7 +280,6 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; @@ -300,9 +297,9 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(static_size, true); + ret = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(static_size, true); + ret = setup_pcpu_embed(true); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " @@ -310,15 +307,14 @@ void __init setup_per_cpu_areas(void) pcpu_fc_names[pcpu_chosen_fc], ret); } } else { - ret = setup_pcpu_lpage(static_size, false); + ret = setup_pcpu_lpage(false); if (ret < 0) - ret = setup_pcpu_embed(static_size, false); + ret = setup_pcpu_embed(false); } if (ret < 0) - ret = setup_pcpu_page(static_size); + ret = setup_pcpu_page(); if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, 
ret); + panic("cannot initialize percpu area (err=%zd)", ret); pcpu_unit_size = ret; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9be05cbe5ee..be2fc8fb9b6 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -84,13 +84,12 @@ extern size_t __init pcpu_setup_first_chunk( #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( - size_t static_size, size_t reserved_size, - ssize_t dyn_size); + size_t reserved_size, ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( - size_t static_size, size_t reserved_size, + size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); @@ -98,16 +97,15 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, + size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); extern ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 7fb40bb1555..e2ac58a39bb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1464,7 +1464,6 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in 
bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @@ -1489,9 +1488,9 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size) +ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t size_sum, unit_size, chunk_size; void *base; unsigned int cpu; @@ -1536,7 +1535,6 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE @@ -1552,12 +1550,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. 
*/ -ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, +ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; char psize_str[16]; int unit_pages; size_t pages_size; @@ -1641,7 +1640,6 @@ out_free_ar: #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_sizep: in/out parameter for dynamic size, -1 for auto * @unit_sizep: out parameter for unit size @@ -1661,13 +1659,14 @@ out_free_ar: * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and * returns the number of units to be allocated. -errno on failure. */ -int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, +int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; int group_cnt_max = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ @@ -1819,7 +1818,6 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes * @unit_size: unit size in bytes @@ -1850,15 +1848,15 @@ static void __init 
pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, +ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; @@ -2037,7 +2035,6 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; ssize_t unit_size; unsigned long delta; unsigned int cpu; @@ -2046,7 +2043,7 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3-70-g09d2 From 3cbc85652767c38b252c8de55f9fd180b29e4c0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: percpu: add @align to pcpu_fc_alloc_fn_t pcpu_fc_alloc_fn_t is about to see more interesting usage, add @align parameter. 
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 ++-- include/linux/percpu.h | 3 ++- mm/percpu.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8aad486c688..660cde13314 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -126,9 +126,9 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, /* * Helpers for first chunk memory allocation */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return pcpu_alloc_bootmem(cpu, size, size); + return pcpu_alloc_bootmem(cpu, size, align); } static void __init pcpu_fc_free(void *ptr, size_t size) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 0cfdd14d096..d385dbcf190 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -71,7 +71,8 @@ extern const char *pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, + size_t align); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); diff --git a/mm/percpu.c b/mm/percpu.c index 287f59cc5fb..3316e3aac7e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1578,7 +1578,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE); + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warning("PERCPU: failed to allocate %s page " "for cpu%u\n", psize_str, cpu); @@ -1888,7 +1888,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, goto found; continue; found: - ptr = 
alloc_fn(cpu, lpage_size); + ptr = alloc_fn(cpu, lpage_size, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " "for cpu%u\n", cpu); -- cgit v1.2.3-70-g09d2 From fd1e8a1fe2b54df6c185b4fa65f181f50b9c4d4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: introduce pcpu_alloc_info and pcpu_group_info Till now, non-linear cpu->unit map was expressed using an integer array which maps each cpu to a unit and used only by lpage allocator. Although how many units have been placed in a single contiguous area (group) is known while building unit_map, the information is lost when the result is recorded into the unit_map array. For lpage allocator, as all allocations are done by lpages and whether two adjacent lpages are in the same group or not is irrelevant, this didn't cause any problem. Non-linear cpu->unit mapping will be used for sparse embedding and this grouping information is necessary for that. This patch introduces pcpu_alloc_info which contains all the information necessary for initializing percpu allocator. pcpu_alloc_info contains array of pcpu_group_info which describes how units are grouped and mapped to cpus. pcpu_group_info also has base_offset field to specify its offset from the chunk's base address. pcpu_build_alloc_info() initializes this field as if all groups are allocated back-to-back as is currently done but this will be used to sparsely place groups. pcpu_alloc_info is a rather complex data structure which contains a flexible array which in turn points to nested cpu_map arrays. * pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to help dealing with pcpu_alloc_info. * pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info, generalized and renamed to pcpu_build_alloc_info(). @cpu_distance_fn may be NULL indicating that all cpus are of LOCAL_DISTANCE.
* pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info, generalized and renamed to pcpu_dump_alloc_info(). It now also prints which group each alloc unit belongs to. * pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the separate parameters. All first chunk allocators are updated to use pcpu_build_alloc_info() to build alloc_info and call pcpu_setup_first_chunk() with it. This has the side effect of packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4. * x86 setup_pcpu_lpage() is updated to deal with alloc_info. * sparc64 setup_per_cpu_areas() is updated to build alloc_info. Although the changes made by this patch are pretty pervasive, it doesn't cause any behavior difference other than packing of sparse cpus. It mostly changes how information is passed among initialization functions and makes room for more flexibility. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 24 +- arch/x86/kernel/setup_percpu.c | 38 ++- include/linux/percpu.h | 42 +++- mm/percpu.c | 529 +++++++++++++++++++++++++---------------- 4 files changed, 389 insertions(+), 244 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 9856d866b77..a42a4a744d1 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1475,17 +1475,29 @@ static void __init pcpu_map_range(unsigned long start, unsigned long end, void __init setup_per_cpu_areas(void) { - size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; + struct pcpu_alloc_info *ai; unsigned long delta, cpu; size_t size_sum, pcpu_unit_size; size_t ptrs_size; void **ptrs; - size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); + + ai->static_size = __per_cpu_end - __per_cpu_start; + ai->reserved_size = PERCPU_MODULE_RESERVE; + + size_sum = 
PFN_ALIGN(ai->static_size + ai->reserved_size + PERCPU_DYNAMIC_RESERVE); - dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; + ai->dyn_size = size_sum - ai->static_size - ai->reserved_size; + ai->unit_size = PCPU_CHUNK_SIZE; + ai->atom_size = PCPU_CHUNK_SIZE; + ai->alloc_size = PCPU_CHUNK_SIZE; + ai->groups[0].nr_units = nr_cpu_ids; + + for_each_possible_cpu(cpu) + ai->groups[0].cpu_map[cpu] = cpu; ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0])); ptrs = alloc_bootmem(ptrs_size); @@ -1497,7 +1509,7 @@ void __init setup_per_cpu_areas(void) free_bootmem(__pa(ptrs[cpu] + size_sum), PCPU_CHUNK_SIZE - size_sum); - memcpy(ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, ai->static_size); } /* allocate address and map */ @@ -1514,9 +1526,7 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(static_size, - PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); free_bootmem(__pa(ptrs), ptrs_size); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 660cde13314..db5f9c49fec 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -161,9 +161,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - size_t unit_map_size, unit_size; - int *unit_map; - int nr_units; + struct pcpu_alloc_info *ai; ssize_t ret; /* on non-NUMA, embedding is better */ @@ -177,26 +175,22 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } /* allocate and build unit_map */ - unit_map_size = nr_cpu_ids * sizeof(int); - unit_map = alloc_bootmem_nopanic(unit_map_size); - if (!unit_map) { - pr_warning("PERCPU: failed to allocate unit_map\n"); - return -ENOMEM; + ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + 
PMD_SIZE, pcpu_lpage_cpu_distance); + if (IS_ERR(ai)) { + pr_warning("PERCPU: failed to build unit_map (%ld)\n", + PTR_ERR(ai)); + return PTR_ERR(ai); } - ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, - &dyn_size, &unit_size, PMD_SIZE, - unit_map, pcpu_lpage_cpu_distance); - if (ret < 0) { - pr_warning("PERCPU: failed to build unit_map\n"); - goto out_free; - } - nr_units = ret; - /* do the parameters look okay? */ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_units * unit_size; + size_t tot_size = 0; + int group; + + for (group = 0; group < ai->nr_groups; group++) + tot_size += ai->unit_size * ai->groups[group].nr_units; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { @@ -207,12 +201,10 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } } - ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - unit_size, PMD_SIZE, unit_map, nr_units, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, + pcpul_map); out_free: - if (ret < 0) - free_bootmem(__pa(unit_map), unit_map_size); + pcpu_free_alloc_info(ai); return ret; } #else diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 570fb18de2b..77b86be8ce4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,25 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +struct pcpu_group_info { + int nr_units; /* aligned # of units */ + unsigned long base_offset; /* base address offset */ + unsigned int *cpu_map; /* unit->cpu map, empty + * entries contain NR_CPUS */ +}; + +struct pcpu_alloc_info { + size_t static_size; + size_t reserved_size; + size_t dyn_size; + size_t unit_size; + size_t atom_size; + size_t alloc_size; + size_t __ai_size; /* internal, don't use */ + int nr_groups; /* 0 if grouping unnecessary */ + struct pcpu_group_info groups[]; +}; + enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, @@ -78,18 +97,17 @@ typedef 
void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_build_unit_map( - size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, +extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units); +extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai); + +extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -#endif -extern size_t __init pcpu_setup_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map); +extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( @@ -106,9 +124,7 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern ssize_t __init pcpu_lpage_first_chunk( - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, + const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 2b9c4b2a2fc..99f7fa68272 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include #include @@ -1245,53 +1246,108 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, return size_sum; } -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** - * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + 
* @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). 
+ */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes - * @dyn_sizep: in/out parameter for dynamic size, -1 for auto - * @unit_sizep: out parameter for unit size - * @unit_map: unit_map to be filled - * @cpu_distance_fn: callback to determine distance between cpus + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional * - * This function builds cpu -> unit map and determine other parameters - * considering needed percpu size, large page size and distances - * between CPUs in NUMA. + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. * - * CPUs which are of LOCAL_DISTANCE both ways are grouped together and - * may share units in the same large page. The returned configuration - * is guaranteed to have CPUs on different nodes on different large - * pages and >=75% usage of allocated virtual address space. + * Groups are always multiples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. * * RETURNS: - * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and - * returns the number of units to be allocated. -errno on failure. + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. 
*/ -int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs; + int last_allocs, group, unit; unsigned int cpu, tcpu; - int group, unit; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; /* * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of lpage_size and is the smallest + * alloc_size is multiple of atom_size and is the smallest * which can accomodate 4k aligned segments which are equal to * or larger than min_unit_size. 
*/ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - alloc_size = roundup(min_unit_size, lpage_size); + alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) upa--; @@ -1304,10 +1360,11 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, for_each_possible_cpu(tcpu) { if (cpu == tcpu) break; - if (group_map[tcpu] == group && + if (group_map[tcpu] == group && cpu_distance_fn && (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { group++; + nr_groups = max(nr_groups, group + 1); goto next_group; } } @@ -1328,7 +1385,7 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) continue; - for (group = 0; group_cnt[group]; group++) { + for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; @@ -1348,75 +1405,122 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, last_allocs = allocs; best_upa = upa; } - *unit_sizep = alloc_size / best_upa; + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = 
alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; - /* assign units to cpus accordingly */ - unit = 0; - for (group = 0; group_cnt[group]; group++) { for_each_possible_cpu(cpu) if (group_map[cpu] == group) - unit_map[cpu] = unit++; - unit = roundup(unit, best_upa); + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; } + BUG_ON(unit != nr_units); - return unit; /* unit contains aligned number of units */ + return ai; } -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, - unsigned int *cpup); - -static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units) +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. 
+ */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) { - int width = 1, v = nr_units; + int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; - int upl, lpl; /* units per lpage, lpage per line */ - unsigned int cpu; - int lpage, unit; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + v = num_possible_cpus(); while (v /= 10) - width++; - empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; - upl = max_t(int, lpage_size / unit_size, 1); - lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); - printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, - static_size, reserved_size, dyn_size, unit_size, lpage_size); + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); - for (lpage = 0, unit = 0; unit < nr_units; unit++) { - if (!(unit % upl)) { - if (!(lpage++ % lpl)) { + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { printk("\n"); - printk("%spcpu-lpage: ", lvl); - } else - printk("| "); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); } - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) - printk("%0*d ", 
width, cpu); - else - printk("%s ", empty_str); } printk("\n"); } -#endif /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes, 0 for none - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @ai: pcpu_alloc_info describing how the percpu area is shaped * @base_addr: mapped address - * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area * setup path. * - * @reserved_size, if non-zero, specifies the amount of bytes to + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1424,13 +1528,26 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size determines the number of bytes available for dynamic - * allocation in the first chunk. The area between @static_size + - * @reserved_size + @dyn_size and @unit_size is unused. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. 
* - * @unit_size specifies unit size and must be aligned to PAGE_SIZE and - * equal to or larger than @static_size + @reserved_size + if - * non-negative, @dyn_size. + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. + * + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. + * + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. @@ -1446,70 +1563,63 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access. 
*/ -size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map) +size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + dyn_size; + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu, tcpu; - int i; + unsigned int cpu; + int *unit_map; + int group, unit, i; /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); + BUG_ON(ai->nr_groups <= 0); + BUG_ON(!ai->static_size); BUG_ON(!base_addr); - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + BUG_ON(ai->unit_size < size_sum); + BUG_ON(ai->unit_size & ~PAGE_MASK); + BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + pcpu_dump_alloc_info(KERN_DEBUG, ai); /* determine number of units and verify and initialize pcpu_unit_map */ - if (unit_map) { - int first_unit = INT_MAX, last_unit = INT_MIN; - - for_each_possible_cpu(cpu) { - int unit = unit_map[cpu]; - - BUG_ON(unit < 0); - for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; - /* the mapping should be one-to-one */ - BUG_ON(unit_map[tcpu] == unit); - } + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - if (unit < first_unit) { - pcpu_first_unit_cpu = cpu; - first_unit = unit; - } - if (unit > last_unit) { - pcpu_last_unit_cpu = cpu; - last_unit = unit; - } - } - pcpu_nr_units = last_unit + 1; - pcpu_unit_map = unit_map; - } else { - int *identity_map; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = NR_CPUS; + pcpu_first_unit_cpu = NR_CPUS; - /* #units == #cpus, identity mapped */ - identity_map = alloc_bootmem(nr_cpu_ids * - sizeof(identity_map[0])); + 
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; - for_each_possible_cpu(cpu) - identity_map[cpu] = cpu; + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - pcpu_first_unit_cpu = 0; - pcpu_last_unit_cpu = pcpu_nr_units - 1; - pcpu_nr_units = nr_cpu_ids; - pcpu_unit_map = identity_map; + BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); + BUG_ON(unit_map[cpu] != NR_CPUS); + + unit_map[cpu] = unit + i; + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + } } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + BUG_ON(unit_map[cpu] == NR_CPUS); + + pcpu_unit_map = unit_map; /* determine basic parameters */ - pcpu_unit_pages = unit_size >> PAGE_SHIFT; + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + @@ -1543,17 +1653,17 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; @@ -1643,44 +1753,47 @@ early_param("percpu_alloc", percpu_alloc_setup); */ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { - const size_t static_size = __per_cpu_end - 
__per_cpu_start; - size_t size_sum, unit_size, chunk_size; + struct pcpu_alloc_info *ai; + size_t size_sum, chunk_size; void *base; - unsigned int cpu; + int unit; + ssize_t ret; - /* determine parameters and allocate */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - chunk_size = unit_size * nr_cpu_ids; + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + chunk_size = ai->unit_size * num_possible_cpus(); base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - return -ENOMEM; + ret = -ENOMEM; + goto out_free_ai; } /* return the leftover and copy */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = base + cpu * unit_size; - - if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + size_sum), - unit_size - size_sum); - memcpy(ptr, __per_cpu_load, static_size); - } else - free_bootmem(__pa(ptr), unit_size); + for (unit = 0; unit < num_possible_cpus(); unit++) { + void *ptr = base + unit * ai->unit_size; + + free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); + memcpy(ptr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size, - unit_size); + PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); - return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base, NULL); + ret = pcpu_setup_first_chunk(ai, base); +out_free_ai: + pcpu_free_alloc_info(ai); + return ret; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA 
*/ @@ -1709,31 +1822,34 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - ssize_t dyn_size = -1; - size_t size_sum, unit_size; + struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; - unsigned int cpu; - int i, j; + int unit, i, j; ssize_t ret; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - unit_pages = unit_size >> PAGE_SHIFT; + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; - for_each_possible_cpu(cpu) + for (unit = 0; unit < num_possible_cpus(); unit++) for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); @@ -1747,18 +1863,18 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * unit_size; + vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); - for_each_possible_cpu(cpu) { + for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = - (unsigned long)vm.addr + cpu * unit_size; + (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* 
pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1772,16 +1888,15 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, */ /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, static_size); + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, static_size, reserved_size, - dyn_size); + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: @@ -1790,6 +1905,7 @@ enomem: ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); return ret; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1805,38 +1921,50 @@ static size_t pcpul_lpage_size; static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, +static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, unsigned int *cpup) { - unsigned int cpu; + int group, cunit; - for_each_possible_cpu(cpu) - if (unit_map[cpu] == unit) { + for (group = 0, cunit = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + if (unit < cunit + gi->nr_units) { if (cpup) - *cpup = cpu; + *cpup = gi->cpu_map[unit - cunit]; return true; } + cunit += gi->nr_units; + } return false; } +static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) +{ + int group, unit, i; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = 
&ai->groups[group]; + + for (i = 0; i < gi->nr_units; i++) + if (gi->cpu_map[i] == cpu) + return unit + i; + } + BUG(); +} + /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes - * @lpage_size: the size of a large page - * @unit_map: cpu -> unit mapping - * @nr_units: the number of units + * @ai: pcpu_alloc_info * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should always specify @dyn_size - * and @unit_size. These parameters along with @unit_map and - * @nr_units can be determined using pcpu_lpage_build_unit_map(). - * This two stage initialization is to allow arch code to evaluate the + * Unlike other helpers, the caller should provide fully initialized + * @ai. This can be done using pcpu_build_alloc_info(). This two + * stage initialization is to allow arch code to evaluate the * parameters before committing to it. * * Large pages are allocated as directed by @unit_map and other @@ -1852,27 +1980,26 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. 
*/ -ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, +ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t chunk_size = unit_size * nr_units; - size_t map_size; + const size_t lpage_size = ai->atom_size; + size_t chunk_size, map_size; unsigned int cpu; ssize_t ret; - int i, j, unit; + int i, j, unit, nr_units; - pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, - unit_size, lpage_size, unit_map, nr_units); + nr_units = 0; + for (i = 0; i < ai->nr_groups; i++) + nr_units += ai->groups[i].nr_units; + chunk_size = ai->unit_size * nr_units; BUG_ON(chunk_size % lpage_size); - pcpul_size = static_size + reserved_size + dyn_size; + pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; pcpul_lpage_size = lpage_size; pcpul_nr_lpages = chunk_size / lpage_size; @@ -1883,13 +2010,13 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate all pages */ for (i = 0; i < pcpul_nr_lpages; i++) { size_t offset = i * lpage_size; - int first_unit = offset / unit_size; - int last_unit = (offset + lpage_size - 1) / unit_size; + int first_unit = offset / ai->unit_size; + int last_unit = (offset + lpage_size - 1) / ai->unit_size; void *ptr; /* find out which cpu is mapped to this unit */ for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + if (pcpul_unit_to_cpu(unit, ai, &cpu)) goto found; continue; found: @@ -1905,12 +2032,12 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* return unused holes */ for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * unit_size; - size_t end = start + unit_size; + size_t start = unit * ai->unit_size; 
+ size_t end = start + ai->unit_size; size_t off, next; /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + if (pcpul_unit_to_cpu(unit, ai, NULL)) start += pcpul_size; /* unit can span more than one page, punch the holes */ @@ -1925,7 +2052,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate address, map and copy */ vm.flags = VM_ALLOC; vm.size = chunk_size; - vm_area_register_early(&vm, unit_size); + vm_area_register_early(&vm, ai->unit_size); for (i = 0; i < pcpul_nr_lpages; i++) { if (!pcpul_map[i].ptr) @@ -1935,15 +2062,15 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, } for_each_possible_cpu(cpu) - memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, - static_size); + memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, + __per_cpu_load, ai->static_size); /* we're ready, commit */ pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, static_size, reserved_size, dyn_size, unit_size); + vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, unit_map); + ret = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped -- cgit v1.2.3-70-g09d2 From fb435d5233f8b6f9b93c11d6304d8e98fed03234 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: percpu: add pcpu_unit_offsets[] Currently units are mapped sequentially into address space. This patch adds pcpu_unit_offsets[] which allows units to be mapped to arbitrary offsets from the chunk base address. This is necessary to allow sparse embedding which might need to allocate address ranges and memory areas which aren't aligned to unit size but to allocation atom size (page or large page size). This also simplifies things a bit by removing the need to calculate offset from unit number.
With this change, there's no need for the arch code to know pcpu_unit_size. Update pcpu_setup_first_chunk() and first chunk allocators to return regular 0 or -errno return code instead of unit size or -errno. Signed-off-by: Tejun Heo Cc: David S. Miller --- arch/sparc/kernel/smp_64.c | 12 +++--- arch/x86/kernel/setup_percpu.c | 51 ++++++++++------------- include/linux/percpu.h | 16 ++++--- mm/percpu.c | 95 +++++++++++++++++++++--------------------- 4 files changed, 84 insertions(+), 90 deletions(-) (limited to 'arch/x86') diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a42a4a744d1..b03fd362c62 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1478,9 +1478,10 @@ void __init setup_per_cpu_areas(void) static struct vm_struct vm; struct pcpu_alloc_info *ai; unsigned long delta, cpu; - size_t size_sum, pcpu_unit_size; + size_t size_sum; size_t ptrs_size; void **ptrs; + int rc; ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); @@ -1526,14 +1527,15 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); + if (rc) + panic("failed to setup percpu first chunk (%d)", rc); free_bootmem(__pa(ptrs), ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; - for_each_possible_cpu(cpu) { - __per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; - } + for_each_possible_cpu(cpu) + __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; /* Setup %g5 for the boot cpu. 
*/ __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index db5f9c49fec..9becc5d4b51 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,12 +157,12 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; struct pcpu_alloc_info *ai; - ssize_t ret; + int rc; /* on non-NUMA, embedding is better */ if (!chosen && !pcpu_need_numa()) @@ -196,19 +196,18 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - ret = -EINVAL; + rc = -EINVAL; goto out_free; } } - ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, - pcpul_map); + rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: pcpu_free_alloc_info(ai); - return ret; + return rc; } #else -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -222,7 +221,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. 
*/ -static ssize_t __init setup_pcpu_embed(bool chosen) +static int __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -250,7 +249,7 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(void) +static int __init setup_pcpu_page(void) { return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, @@ -274,8 +273,7 @@ void __init setup_per_cpu_areas(void) { unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); @@ -285,36 +283,33 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = -EINVAL; + rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(true); + rc = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(true); + rc = setup_pcpu_embed(true); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], ret); + pcpu_fc_names[pcpu_chosen_fc], rc); } } else { - ret = setup_pcpu_lpage(false); - if (ret < 0) - ret = setup_pcpu_embed(false); + rc = setup_pcpu_lpage(false); + if (rc < 0) + rc = setup_pcpu_embed(false); } - if (ret < 0) - ret = setup_pcpu_page(); - if (ret < 0) - panic("cannot initialize percpu area (err=%zd)", ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = setup_pcpu_page(); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = - delta + 
pcpu_unit_map[cpu] * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 77b86be8ce4..a7ec840f596 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,7 +57,7 @@ #endif extern void *pcpu_base_addr; -extern const int *pcpu_unit_map; +extern const unsigned long *pcpu_unit_offsets; struct pcpu_group_info { int nr_units; /* aligned # of units */ @@ -106,25 +106,23 @@ extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr); +extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern ssize_t __init pcpu_embed_first_chunk( - size_t reserved_size, ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, + ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK -extern ssize_t __init pcpu_page_first_chunk( - size_t reserved_size, +extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern ssize_t __init pcpu_lpage_first_chunk( - const struct pcpu_alloc_info *ai, +extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 99f7fa68272..653b02c4020 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -117,8 +117,8 @@ static unsigned int pcpu_last_unit_cpu __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* cpu -> unit map */ -const int 
*pcpu_unit_map __read_mostly; +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ /* * The first chunk which always exists. Note that unlike other @@ -196,8 +196,8 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx) static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, @@ -341,7 +341,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ - addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; + addr += pcpu_unit_offsets[smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -1560,17 +1560,17 @@ static void pcpu_dump_alloc_info(const char *lvl, * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. 
*/ -size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; @@ -1587,8 +1587,9 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_dump_alloc_info(KERN_DEBUG, ai); - /* determine number of units and verify and initialize pcpu_unit_map */ + /* determine number of units and initialize unit_map and base */ unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = NR_CPUS; @@ -1606,6 +1607,8 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] != NR_CPUS); unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; } @@ -1617,6 +1620,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] == NR_CPUS); pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -1688,7 +1692,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done */ pcpu_base_addr = schunk->vm->addr; - return pcpu_unit_size; + return 0; } const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { @@ -1748,16 +1752,15 @@ early_param("percpu_alloc", percpu_alloc_setup); * size, the leftover is returned to the bootmem allocator. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. 
*/ -ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { struct pcpu_alloc_info *ai; size_t size_sum, chunk_size; void *base; int unit; - ssize_t ret; + int rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); if (IS_ERR(ai)) @@ -1773,7 +1776,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - ret = -ENOMEM; + rc = -ENOMEM; goto out_free_ai; } @@ -1790,10 +1793,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, base); + rc = pcpu_setup_first_chunk(ai, base); out_free_ai: pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1813,13 +1816,12 @@ out_free_ai: * page-by-page into vmalloc area. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. 
*/ -ssize_t __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -1827,8 +1829,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, int unit_pages; size_t pages_size; struct page **pages; - int unit, i, j; - ssize_t ret; + int unit, i, j, rc; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -1874,10 +1875,10 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], - unit_pages); - if (ret < 0) - panic("failed to map percpu area, err=%zd\n", ret); + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); /* * FIXME: Archs with virtual cache should flush local @@ -1896,17 +1897,17 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pages[j]), PAGE_SIZE); - ret = -ENOMEM; + rc = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1977,20 +1978,18 @@ static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) * pcpu_lpage_remapped(). * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. 
+ * 0 on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) +int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; const size_t lpage_size = ai->atom_size; size_t chunk_size, map_size; unsigned int cpu; - ssize_t ret; - int i, j, unit, nr_units; + int i, j, unit, nr_units, rc; nr_units = 0; for (i = 0; i < ai->nr_groups; i++) @@ -2070,7 +2069,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped @@ -2094,7 +2093,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) pcpul_nr_lpages--; - return ret; + return rc; enomem: for (i = 0; i < pcpul_nr_lpages; i++) @@ -2166,21 +2165,21 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - ssize_t unit_size; unsigned long delta; unsigned int cpu; + int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. 
*/ - unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); - if (unit_size < 0) + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE); + if (rc < 0) panic("Failed to initialized percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) - __per_cpu_offset[cpu] = delta + cpu * unit_size; + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ -- cgit v1.2.3-70-g09d2 From c8826dd538602d730ed2c18c6753f1bbfa6c4933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: percpu: update embedding first chunk allocator to handle sparse units Now that percpu core can handle very sparse units, given that vmalloc space is large enough, embedding first chunk allocator can use any memory to build the first chunk. This patch teaches pcpu_embed_first_chunk() about distances between cpus and to use alloc/free callbacks to allocate node specific areas for each group and use them for the first chunk. This brings the benefits of embedding allocator to NUMA configurations - no extra TLB pressure with the flexibility of unified dynamic allocator and no need to restructure arch code to build memory layout suitable for percpu. With units put into atom_size aligned groups according to cpu distances, using large page for dynamic chunks is also easily possible with falling back to regular pages if large allocation fails. Embedding allocator users are converted to specify NULL cpu_distance_fn, so this patch doesn't cause any visible behavior difference. Following patches will convert them.
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 +- include/linux/percpu.h | 7 ++- mm/percpu.c | 113 +++++++++++++++++++++++++++++++---------- 3 files changed, 93 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9becc5d4b51..67f6314de9f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -234,7 +234,9 @@ static int __init setup_pcpu_embed(bool chosen) return -EINVAL; return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE); + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PAGE_SIZE, NULL, pcpu_fc_alloc, + pcpu_fc_free); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a7ec840f596..25359932740 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,8 +110,11 @@ extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern int __init pcpu_embed_first_chunk(size_t reserved_size, - ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index cc9c4c64606..c2826d05505 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1747,15 +1747,25 @@ early_param("percpu_alloc", percpu_alloc_setup); * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded 
first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than * specified to fill page alignment. When @dyn_size is auto, @@ -1763,53 +1773,88 @@ early_param("percpu_alloc", percpu_alloc_setup); * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: * 0 on success, -errno on failure. 
*/ -int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { + void *base = (void *)ULONG_MAX; + void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, chunk_size; - void *base; - int unit; - int rc; + size_t size_sum, areas_size; + int group, i, rc; - ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); - BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; - chunk_size = ai->unit_size * num_possible_cpus(); + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!base) { - pr_warning("PERCPU: failed to allocate %zu bytes for " - "embedding\n", chunk_size); + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { rc = -ENOMEM; - goto out_free_ai; + goto out_free; } - /* return the leftover and copy */ - for (unit = 0; unit < num_possible_cpus(); unit++) { - void *ptr = base + unit * ai->unit_size; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; - free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); - memcpy(ptr, __per_cpu_load, ai->static_size); + base = min(ptr, base); + 
+ for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } } - /* we're ready, commit */ + /* base address is now known, determine group base offsets */ + for (group = 0; group < ai->nr_groups; group++) + ai->groups[group].base_offset = areas[group] - base; + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); -out_free_ai: + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || @@ -2177,6 +2222,17 @@ void *pcpu_lpage_remapped(void *kaddr) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} + +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -2188,7 +2244,8 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. 
*/ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialized percpu areas."); -- cgit v1.2.3-70-g09d2 From 4518e6a0c038b98be4c480e6f4481e8676bd15dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA Embedding percpu first chunk allocator can now handle very sparse unit mapping. Use embedding allocator instead of lpage for 64bit NUMA. This removes extra TLB pressure and the need to do complex and fragile dancing when changing page attributes. For 32bit, using very sparse unit mapping isn't a good idea because the vmalloc space is very constrained. 32bit NUMA machines aren't exactly the focus of optimization and it isn't very clear whether lpage performs better than page. Use page first chunk allocator for 32bit NUMAs. As this leaves setup_pcpu_*() functions pretty much empty, fold them into setup_per_cpu_areas(). 
Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Andi Kleen --- arch/x86/Kconfig | 4 -- arch/x86/kernel/setup_percpu.c | 155 ++++++++--------------------------------- 2 files changed, 28 insertions(+), 131 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f7ac2721551..869d7d30144 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y -config NEED_PER_CPU_LPAGE_FIRST_CHUNK - def_bool y - depends on NEED_MULTIPLE_NODES - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 67f6314de9f..d559af913e1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size) free_bootmem(__pa(ptr), size); } -/* - * Large page remapping allocator - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES -static void __init pcpul_map(void *ptr, size_t size, void *addr) -{ - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)addr); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); -} - -static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { +#ifdef CONFIG_NEED_MULTIPLE_NODES if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; -} - -static int __init setup_pcpu_lpage(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + 
PERCPU_DYNAMIC_RESERVE; - size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - struct pcpu_alloc_info *ai; - int rc; - - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* allocate and build unit_map */ - ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpu_lpage_cpu_distance); - if (IS_ERR(ai)) { - pr_warning("PERCPU: failed to build unit_map (%ld)\n", - PTR_ERR(ai)); - return PTR_ERR(ai); - } - - /* do the parameters look okay? */ - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = 0; - int group; - - for (group = 0; group < ai->nr_groups; group++) - tot_size += ai->unit_size * ai->groups[group].nr_units; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - rc = -EINVAL; - goto out_free; - } - } - - rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); -out_free: - pcpu_free_alloc_info(ai); - return rc; -} #else -static int __init setup_pcpu_lpage(bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static int __init setup_pcpu_embed(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. 
- */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PAGE_SIZE, NULL, pcpu_fc_alloc, - pcpu_fc_free); } -/* - * Page allocator - * - * Boring fallback 4k page allocator. This allocator puts more - * pressure on PTE TLBs but other than that behaves nicely on both UMA - * and NUMA. - */ static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static int __init setup_pcpu_page(void) -{ - return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif rc = -EINVAL; - if (pcpu_chosen_fc != PCPU_FC_AUTO) { - if (pcpu_chosen_fc != PCPU_FC_PAGE) { - if (pcpu_chosen_fc == PCPU_FC_LPAGE) - rc = setup_pcpu_lpage(true); - else - rc = setup_pcpu_embed(true); - - if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); - } - } else { - rc = setup_pcpu_lpage(false); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? 
PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - rc = setup_pcpu_embed(false); + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) - rc = setup_pcpu_page(); + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); -- cgit v1.2.3-70-g09d2 From e933a73f48e3b2d40cfa56d81e2646f194b5a66a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:53 +0900 Subject: percpu: kill lpage first chunk allocator With x86 converted to embedding allocator, lpage doesn't have any user left. Kill it along with cpa handling code. Signed-off-by: Tejun Heo Cc: Jan Beulich --- Documentation/kernel-parameters.txt | 10 +- arch/x86/mm/pageattr.c | 20 +-- include/linux/percpu.h | 16 --- mm/percpu.c | 241 ------------------------------------ 4 files changed, 6 insertions(+), 281 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index dee9ce2e6cf..e710093e3d3 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,11 +1920,11 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= Select which percpu first chunk allocator to use. - Currently supported values are "embed", "page" and - "lpage". Archs may support subset or none of the - selections. See comments in mm/percpu.c for details - on each allocator. This parameter is primarily for - debugging and performance comparison. + Currently supported values are "embed" and "page". + Archs may support subset or none of the selections. 
+ See comments in mm/percpu.c for details on each + allocator. This parameter is primarily for debugging + and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dce282f6570..f53cfc7f963 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -687,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -745,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. - */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 25359932740..878836ca999 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,7 +82,6 @@ enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, PCPU_FC_PAGE, - PCPU_FC_LPAGE, PCPU_FC_NR, }; @@ -95,7 +94,6 @@ typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); -typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int 
nr_groups, int nr_units); @@ -124,20 +122,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn); - -extern void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index c2826d05505..77933928107 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1713,7 +1713,6 @@ const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", - [PCPU_FC_LPAGE] = "lpage", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; @@ -1729,10 +1728,6 @@ static int __init percpu_alloc_setup(char *str) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; -#endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK - else if (!strcmp(str, "lpage")) - pcpu_chosen_fc = PCPU_FC_LPAGE; #endif else pr_warning("PERCPU: unknown allocator %s specified\n", str); @@ -1970,242 +1965,6 @@ out_free_ar: } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -struct pcpul_ent { - void *ptr; - void *map_addr; -}; - -static size_t pcpul_size; -static size_t pcpul_lpage_size; -static int pcpul_nr_lpages; -static struct pcpul_ent *pcpul_map; - -static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, - unsigned int *cpup) -{ - int group, cunit; - - for (group = 0, cunit = 0; group < ai->nr_groups; group++) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - if (unit < cunit + gi->nr_units) { - if (cpup) - *cpup = 
gi->cpu_map[unit - cunit]; - return true; - } - cunit += gi->nr_units; - } - - return false; -} - -static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) -{ - int group, unit, i; - - for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - for (i = 0; i < gi->nr_units; i++) - if (gi->cpu_map[i] == cpu) - return unit + i; - } - BUG(); -} - -/** - * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @ai: pcpu_alloc_info - * @alloc_fn: function to allocate percpu lpage, always called with lpage_size - * @free_fn: function to free percpu memory, @size <= lpage_size - * @map_fn: function to map percpu lpage, always called with lpage_size - * - * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should provide fully initialized - * @ai. This can be done using pcpu_build_alloc_info(). This two - * stage initialization is to allow arch code to evaluate the - * parameters before committing to it. - * - * Large pages are allocated as directed by @unit_map and other - * parameters and mapped to vmalloc space. Unused holes are returned - * to the page allocator. Note that these holes end up being actively - * mapped twice - once to the physical mapping and to the vmalloc area - * for the first percpu chunk. Depending on architecture, this might - * cause problem when changing page attributes of the returned area. - * These double mapped areas can be detected using - * pcpu_lpage_remapped(). - * - * RETURNS: - * 0 on success, -errno on failure. 
- */ -int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - static struct vm_struct vm; - const size_t lpage_size = ai->atom_size; - size_t chunk_size, map_size; - unsigned int cpu; - int i, j, unit, nr_units, rc; - - nr_units = 0; - for (i = 0; i < ai->nr_groups; i++) - nr_units += ai->groups[i].nr_units; - - chunk_size = ai->unit_size * nr_units; - BUG_ON(chunk_size % lpage_size); - - pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; - pcpul_lpage_size = lpage_size; - pcpul_nr_lpages = chunk_size / lpage_size; - - /* allocate pointer array and alloc large pages */ - map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); - pcpul_map = alloc_bootmem(map_size); - - /* allocate all pages */ - for (i = 0; i < pcpul_nr_lpages; i++) { - size_t offset = i * lpage_size; - int first_unit = offset / ai->unit_size; - int last_unit = (offset + lpage_size - 1) / ai->unit_size; - void *ptr; - - /* find out which cpu is mapped to this unit */ - for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, ai, &cpu)) - goto found; - continue; - found: - ptr = alloc_fn(cpu, lpage_size, lpage_size); - if (!ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - pcpul_map[i].ptr = ptr; - } - - /* return unused holes */ - for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * ai->unit_size; - size_t end = start + ai->unit_size; - size_t off, next; - - /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, ai, NULL)) - start += pcpul_size; - - /* unit can span more than one page, punch the holes */ - for (off = start; off < end; off = next) { - void *ptr = pcpul_map[off / lpage_size].ptr; - next = min(roundup(off + 1, lpage_size), end); - if (ptr) - free_fn(ptr + off % lpage_size, next - off); - } - } - - /* allocate address, map and copy */ - vm.flags = VM_ALLOC; - 
vm.size = chunk_size; - vm_area_register_early(&vm, ai->unit_size); - - for (i = 0; i < pcpul_nr_lpages; i++) { - if (!pcpul_map[i].ptr) - continue; - pcpul_map[i].map_addr = vm.addr + i * lpage_size; - map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); - } - - for_each_possible_cpu(cpu) - memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, - __per_cpu_load, ai->static_size); - - /* we're ready, commit */ - pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, - ai->unit_size); - - rc = pcpu_setup_first_chunk(ai, vm.addr); - - /* - * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped - * lpages are pushed to the end and trimmed. - */ - for (i = 0; i < pcpul_nr_lpages - 1; i++) - for (j = i + 1; j < pcpul_nr_lpages; j++) { - struct pcpul_ent tmp; - - if (!pcpul_map[j].ptr) - continue; - if (pcpul_map[i].ptr && - pcpul_map[i].ptr < pcpul_map[j].ptr) - continue; - - tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) - pcpul_nr_lpages--; - - return rc; - -enomem: - for (i = 0; i < pcpul_nr_lpages; i++) - if (pcpul_map[i].ptr) - free_fn(pcpul_map[i].ptr, lpage_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu large - * page mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used large - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. 
- */ -void *pcpu_lpage_remapped(void *kaddr) -{ - unsigned long lpage_mask = pcpul_lpage_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); - unsigned long offset = (unsigned long)kaddr & lpage_mask; - int left = 0, right = pcpul_nr_lpages - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < lpage_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > lpage_addr) - right = pos - 1; - else - return pcpul_map[pos].map_addr + offset; - } - - return NULL; -} -#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ - /* * Generic percpu area setup. * -- cgit v1.2.3-70-g09d2 From 58c41d28259c246dbc11358d85d332dc20ccd57b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 14 Aug 2009 12:14:19 -0700 Subject: x86, intel_txt: Factor out the code for S3 setup S3 sleep requires special setup in tboot. However, the data structures needed to do such setup are only available if CONFIG_ACPI_SLEEP is enabled. Abstract them out as much as possible, so we can have a single tboot_setup_sleep() which either is a proper implementation or a stub which simply calls BUG(). Signed-off-by: H. 
Peter Anvin Acked-by: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 53 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 1ab80120894..a183beffe39 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -164,25 +165,51 @@ void tboot_create_trampoline(void) map_base = PFN_DOWN(tboot->tboot_base); map_size = PFN_UP(tboot->tboot_size); if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) - panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); } -static void set_mac_regions(void) +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) { - tboot->num_mac_regions = 3; + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + /* S3 resume code */ - tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); - tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); /* AP trampoline code */ - tboot->mac_regions[1].start = - PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); - tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); /* kernel code + data + bss */ - tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); - tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - - PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + 
add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; } +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + void tboot_shutdown(u32 shutdown_type) { void (*shutdown)(void); @@ -200,7 +227,8 @@ void tboot_shutdown(u32 shutdown_type) /* if this is S3 then set regions to MAC */ if (shutdown_type == TB_SHUTDOWN_S3) - set_mac_regions(); + if (tboot_setup_sleep()) + return; tboot->shutdown_type = shutdown_type; @@ -253,7 +281,6 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; /* we always use the 32b wakeup vector */ tboot->acpi_sinfo.vector_width = 32; - tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { -- cgit v1.2.3-70-g09d2 From 1be396794897f80bfc8774719ba60309a9e3d374 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:20 +0200 Subject: timekeeping: Move reset of cycle_last for tsc clocksource to tsc change_clocksource resets the cycle_last value to zero then sets it to a value read from the clocksource. The reset to zero is required only for the TSC clocksource to make the read_tsc function work after a resume. The reason is that the TSC read function uses cycle_last to detect backwards going TSCs. In the resume case cycle_last contains the TSC value from the last update before the suspend. On resume the TSC starts counting from 0 again and would trip over the cycle_last comparison. This is subtle and surprising. Move the reset to a resume function in the tsc code. 
Signed-off-by: Martin Schwidefsky Acked-by: Thomas Gleixner Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134808.142191175@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 6 ++++++ kernel/time/timekeeping.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..968425422c4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -744,10 +744,16 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif +static void resume_tsc(void) +{ + clocksource_tsc.cycle_last = 0; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, .read = read_tsc, + .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 016a2591d71..b5673016089 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -295,7 +295,6 @@ static void change_clocksource(void) if (old->disable) old->disable(old); - clock->cycle_last = 0; clock->cycle_last = clock->read(clock); clock->error = 0; clock->xtime_nsec = 0; -- cgit v1.2.3-70-g09d2 From d4f587c67fc39e0030ddd718675e252e208da4d7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 14 Aug 2009 15:47:31 +0200 Subject: timekeeping: Increase granularity of read_persistent_clock() The persistent clock of some architectures (e.g. s390) have a better granularity than seconds. To reduce the delta between the host clock and the guest clock in a virtualized system change the read_persistent_clock function to return a struct timespec. 
Signed-off-by: Martin Schwidefsky Cc: Ingo Molnar Acked-by: John Stultz Cc: Daniel Walker LKML-Reference: <20090814134811.013873340@de.ibm.com> Signed-off-by: Thomas Gleixner --- arch/m68knommu/kernel/time.c | 5 ++-- arch/mips/dec/time.c | 5 ++-- arch/mips/lasat/ds1603.c | 5 ++-- arch/mips/lasat/sysctl.c | 8 ++++-- arch/mips/lemote/lm2e/setup.c | 5 ++-- arch/mips/mti-malta/malta-time.c | 5 ++-- arch/mips/pmc-sierra/yosemite/setup.c | 5 ++-- arch/mips/sibyte/swarm/setup.c | 15 +++++++--- arch/mips/sni/time.c | 5 ++-- arch/powerpc/kernel/time.c | 7 +++-- arch/s390/kernel/time.c | 22 +++------------ arch/sh/kernel/time.c | 6 ++-- arch/x86/kernel/rtc.c | 5 ++-- arch/xtensa/kernel/time.c | 5 ++-- include/linux/time.h | 2 +- kernel/time/timekeeping.c | 52 +++++++++++++++++++---------------- 16 files changed, 83 insertions(+), 74 deletions(-) (limited to 'arch/x86') diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index d182b2f7221..68432248515 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -72,9 +72,10 @@ static unsigned long read_rtc_mmss(void) return mktime(year, mon, day, hour, min, sec);; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return read_rtc_mmss(); + ts->tv_sec = read_rtc_mmss(); + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c index 463136e6685..02f505f23c3 100644 --- a/arch/mips/dec/time.c +++ b/arch/mips/dec/time.c @@ -18,7 +18,7 @@ #include #include -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, mon, day, hour, min, sec, real_year; unsigned long flags; @@ -53,7 +53,8 @@ unsigned long read_persistent_clock(void) year += real_year - 72 + 2000; - return mktime(year, mon, day, hour, min, sec); + ts->tv_sec = mktime(year, mon, day, hour, min, sec); + ts->tv_nsec = 0; } /* diff --git a/arch/mips/lasat/ds1603.c 
b/arch/mips/lasat/ds1603.c index 52cb1436a12..c6fd96ff118 100644 --- a/arch/mips/lasat/ds1603.c +++ b/arch/mips/lasat/ds1603.c @@ -135,7 +135,7 @@ static void rtc_end_op(void) lasat_ndelay(1000); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long word; unsigned long flags; @@ -147,7 +147,8 @@ unsigned long read_persistent_clock(void) rtc_end_op(); spin_unlock_irqrestore(&rtc_lock, flags); - return word; + ts->tv_sec = word; + ts->tv_nsec = 0; } int rtc_mips_set_mmss(unsigned long time) diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 8f88886feb1..3f04d4c406b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -92,10 +92,12 @@ static int rtctmp; int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, void *buffer, size_t *lenp, loff_t *ppos) { + struct timespec ts; int r; if (!write) { - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; /* check for time < 0 and set to 0 */ if (rtctmp < 0) rtctmp = 0; @@ -134,9 +136,11 @@ int sysctl_lasat_rtc(ctl_table *table, void *oldval, size_t *oldlenp, void *newval, size_t newlen) { + struct timespec ts; int r; - rtctmp = read_persistent_clock(); + read_persistent_clock(&ts); + rtctmp = ts.tv_sec; if (rtctmp < 0) rtctmp = 0; r = sysctl_intvec(table, oldval, oldlenp, newval, newlen); diff --git a/arch/mips/lemote/lm2e/setup.c b/arch/mips/lemote/lm2e/setup.c index ebd6ceaef2f..24b355df612 100644 --- a/arch/mips/lemote/lm2e/setup.c +++ b/arch/mips/lemote/lm2e/setup.c @@ -54,9 +54,10 @@ void __init plat_time_init(void) mips_hpt_frequency = cpu_clock_freq / 2; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } void (*__wbflush)(void); diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 0b97d47691f..3c6f190aa61 
100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -100,9 +100,10 @@ static unsigned int __init estimate_cpu_frequency(void) return count; } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return mc146818_get_cmos_time(); + ts->tv_sec = mc146818_get_cmos_time(); + ts->tv_nsec = 0; } static void __init plat_perf_setup(void) diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c index 2d3c0dca275..3498ac9c35a 100644 --- a/arch/mips/pmc-sierra/yosemite/setup.c +++ b/arch/mips/pmc-sierra/yosemite/setup.c @@ -70,7 +70,7 @@ void __init bus_error_init(void) } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned int year, month, day, hour, min, sec; unsigned long flags; @@ -92,7 +92,8 @@ unsigned long read_persistent_clock(void) m48t37_base->control = 0x00; spin_unlock_irqrestore(&rtc_lock, flags); - return mktime(year, month, day, hour, min, sec); + ts->tv_sec = mktime(year, month, day, hour, min, sec); + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long tim) diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index 672e45d495a..623ffc933c4 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -87,19 +87,26 @@ enum swarm_rtc_type { enum swarm_rtc_type swarm_rtc_type; -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { + unsigned long sec; + switch (swarm_rtc_type) { case RTC_XICOR: - return xicor_get_time(); + sec = xicor_get_time(); + break; case RTC_M4LT81: - return m41t81_get_time(); + sec = m41t81_get_time(); + break; case RTC_NONE: default: - return mktime(2000, 1, 1, 0, 0, 0); + sec = mktime(2000, 1, 1, 0, 0, 0); + break; } + ts->tv_sec = sec; + ts->tv_nsec = 0; } int rtc_mips_set_time(unsigned long sec) diff --git a/arch/mips/sni/time.c b/arch/mips/sni/time.c index 0d9ec1a5c24..62df6a598e0 100644
--- a/arch/mips/sni/time.c +++ b/arch/mips/sni/time.c @@ -182,7 +182,8 @@ void __init plat_time_init(void) setup_pit_timer(); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - return -1; + ts->tv_sec = -1; + ts->tv_nsec = 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceea..ad63f30fe3d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -769,7 +769,7 @@ int update_persistent_clock(struct timespec now) return ppc_md.set_rtc_time(&tm); } -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { struct rtc_time tm; static int first = 1; @@ -787,8 +787,9 @@ unsigned long read_persistent_clock(void) if (!ppc_md.get_rtc_time) return 0; ppc_md.get_rtc_time(&tm); - return mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, - tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_sec = mktime(tm.tm_year+1900, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec); + ts->tv_nsec = 0; } /* clocksource code */ diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index e76c2e7a8b9..a94ec48587b 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -182,12 +182,9 @@ static void timing_alert_interrupt(__u16 code) static void etr_reset(void); static void stp_reset(void); -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec ts; - - tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, &ts); - return ts.tv_sec; + tod_to_timeval(get_clock() - TOD_UNIX_EPOCH, ts); } static cycle_t read_tod_clock(struct clocksource *cs) @@ -248,7 +245,6 @@ void __init time_init(void) { struct timespec ts; unsigned long flags; - cycle_t now; /* Reset time synchronization interfaces. */ etr_reset(); @@ -266,20 +262,10 @@ void __init time_init(void) panic("Could not register TOD clock source"); /* - * The TOD clock is an accurate clock. 
The xtime should be - * initialized in a way that the difference between TOD and - * xtime is reasonably small. Too bad that timekeeping_init - * sets xtime.tv_nsec to zero. In addition the clock source - * change from the jiffies clock source to the TOD clock - * source add another error of up to 1/HZ second. The same - * function sets wall_to_monotonic to a value that is too - * small for /proc/uptime to be accurate. - * Reset xtime and wall_to_monotonic to sane values. + * Reset wall_to_monotonic to the initial timestamp created + * in head.S to get a precise value in /proc/uptime. */ write_seqlock_irqsave(&xtime_lock, flags); - now = get_clock(); - tod_to_timeval(now - TOD_UNIX_EPOCH, &xtime); - clocksource_tod.cycle_last = now; tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, &ts); set_normalized_timespec(&wall_to_monotonic, -ts.tv_sec, -ts.tv_nsec); write_sequnlock_irqrestore(&xtime_lock, flags); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 9b352a1e3fb..3f4706aa975 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -39,11 +39,9 @@ void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; #ifdef CONFIG_GENERIC_CMOS_UPDATE -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { - struct timespec tv; - rtc_sh_get_time(&tv); - return tv.tv_sec; + rtc_sh_get_time(ts); } int update_persistent_clock(struct timespec now) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 5d465b207e7..bf67dcb4a44 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -178,7 +178,7 @@ static int set_rtc_mmss(unsigned long nowtime) } /* not static: needed by APM */ -unsigned long read_persistent_clock(void) +void read_persistent_clock(struct timespec *ts) { unsigned long retval, flags; @@ -186,7 +186,8 @@ unsigned long read_persistent_clock(void) retval = get_wallclock(); spin_unlock_irqrestore(&rtc_lock, flags); - return
retval; + ts->tv_sec = retval; + ts->tv_nsec = 0; } int update_persistent_clock(struct timespec now) diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 8848120d291..19085ff0484 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -59,9 +59,8 @@ static struct irqaction timer_irqaction = { void __init time_init(void) { - xtime.tv_nsec = 0; - xtime.tv_sec = read_persistent_clock(); - + /* FIXME: xtime&wall_to_monotonic are set in timekeeping_init. */ + read_persistent_clock(&xtime); set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); diff --git a/include/linux/time.h b/include/linux/time.h index e7c84455888..53a3216f0d1 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -101,7 +101,7 @@ extern struct timespec xtime; extern struct timespec wall_to_monotonic; extern seqlock_t xtime_lock; -extern unsigned long read_persistent_clock(void); +extern void read_persistent_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 41579e7fcf9..f1a21ce491e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -154,7 +154,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static unsigned long total_sleep_time; /* seconds */ +static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -487,17 +487,18 @@ int timekeeping_valid_for_hres(void) } /** - * read_persistent_clock - Return time in seconds from the persistent clock. + * read_persistent_clock - Return time from the persistent clock. * * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. 
- * Returns zero if unsupported. + * Reads the time from the battery backed persistent clock. + * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -unsigned long __attribute__((weak)) read_persistent_clock(void) +void __attribute__((weak)) read_persistent_clock(struct timespec *ts) { - return 0; + ts->tv_sec = 0; + ts->tv_nsec = 0; } /* @@ -507,7 +508,9 @@ void __init timekeeping_init(void) { struct clocksource *clock; unsigned long flags; - unsigned long sec = read_persistent_clock(); + struct timespec now; + + read_persistent_clock(&now); write_seqlock_irqsave(&xtime_lock, flags); @@ -518,19 +521,20 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; + xtime.tv_sec = now.tv_sec; + xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); update_xtime_cache(0); - total_sleep_time = 0; + total_sleep_time.tv_sec = 0; + total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } /* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; +static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. 
@@ -543,18 +547,19 @@ static unsigned long timekeeping_suspend_time; static int timekeeping_resume(struct sys_device *dev) { unsigned long flags; - unsigned long now = read_persistent_clock(); + struct timespec ts; + + read_persistent_clock(&ts); clocksource_resume(); write_seqlock_irqsave(&xtime_lock, flags); - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - total_sleep_time += sleep_length; + if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { + ts = timespec_sub(ts, timekeeping_suspend_time); + xtime = timespec_add_safe(xtime, ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); + total_sleep_time = timespec_add_safe(total_sleep_time, ts); } update_xtime_cache(0); /* re-base the last cycle value */ @@ -577,7 +582,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; - timekeeping_suspend_time = read_persistent_clock(); + read_persistent_clock(&timekeeping_suspend_time); write_seqlock_irqsave(&xtime_lock, flags); timekeeping_forward_now(); @@ -801,9 +806,10 @@ void update_wall_time(void) */ void getboottime(struct timespec *ts) { - set_normalized_timespec(ts, - - (wall_to_monotonic.tv_sec + total_sleep_time), - - wall_to_monotonic.tv_nsec); + struct timespec boottime; + + boottime = timespec_add_safe(wall_to_monotonic, total_sleep_time); + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } /** @@ -812,7 +818,7 @@ void getboottime(struct timespec *ts) */ void monotonic_to_bootbased(struct timespec *ts) { - ts->tv_sec += total_sleep_time; + *ts = timespec_add_safe(*ts, total_sleep_time); } unsigned long get_seconds(void) -- cgit v1.2.3-70-g09d2 From 62a3207b8cf3de35368cdc3822b30b82d59eea95 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Mon, 17 Aug 2009 11:16:16 -0700 Subject: x86, intel_txt: Handle ACPI_SLEEP without X86_TRAMPOLINE On 32 bits, we can have CONFIG_ACPI_SLEEP set without implying CONFIG_X86_TRAMPOLINE. In that case, we simply do not need to mark the trampoline as a MAC region. Signed-off-by: H. Peter Anvin Cc: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a183beffe39..c2e760ca7b0 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -189,8 +189,12 @@ static int tboot_setup_sleep(void) /* S3 resume code */ add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE /* AP trampoline code */ add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + /* kernel code + data + bss */ add_mac_region(virt_to_phys(_text), _end - _text); -- cgit v1.2.3-70-g09d2 From b7f42ab2e237f08a5bbcefa17473e80eb05e725c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 17 Aug 2009 11:19:40 -0700 Subject: x86, apic: Move dmar_table_init() out of enable_IR() On an x2apic system, we got: [ 1.818072] ------------[ cut here ]------------ [ 1.820376] WARNING: at kernel/lockdep.c:2461 lockdep_trace_alloc+0xa5/0xe9() [ 1.835282] Hardware name: ASSY, [ 1.839006] Modules linked in: [ 1.841253] Pid: 1, comm: swapper Not tainted 2.6.31-rc5-tip-03926-g39aaa80-dirty #510 [ 1.858056] Call Trace: [ 1.859913] [] ? lockdep_trace_alloc+0xa5/0xe9 [ 1.876270] [] warn_slowpath_common+0x8d/0xd0 [ 1.879132] [] warn_slowpath_null+0x27/0x3d [ 1.896823] [] lockdep_trace_alloc+0xa5/0xe9 [ 1.900659] [] ? lock_release_holdtime+0x2f/0x199 [ 1.917188] [] kmem_cache_alloc_notrace+0x42/0x111 [ 1.922320] [] ? reserve_memtype+0x152/0x518 [ 1.938137] [] ? pat_pagerange_is_ram+0x4a/0x91 [ 1.941730] [] reserve_memtype+0x152/0x518 [ 1.958115] [] __ioremap_caller+0x1dd/0x30f [ 1.975507] [] ? 
acpi_os_map_memory+0x2a/0x47 [ 1.978987] [] ioremap_nocache+0x2a/0x40 [ 2.031400] [] ? trace_hardirqs_off+0x20/0x36 [ 2.036096] [] acpi_os_map_memory+0x2a/0x47 [ 2.046263] [] acpi_tb_verify_table+0x3d/0x85 [ 2.050349] [] ? _spin_unlock_irqrestore+0x50/0x76 [ 2.067327] [] acpi_get_table_with_size+0x64/0xd9 [ 2.070860] [] ? _spin_unlock_irqrestore+0x50/0x76 [ 2.088000] [] dmar_table_detect+0x33/0x70 [ 2.092047] [] dmar_table_init+0x43/0x428 [ 2.106854] [] enable_IR+0x1c/0x8d [ 2.110256] [] enable_IR_x2apic+0x7c/0x19e [ 2.127139] [] native_smp_prepare_cpus+0x139/0x3b8 [ 2.145175] [] kernel_init+0x71/0x1da [ 2.148913] [] child_rip+0xa/0x20 [ 2.152349] [] ? restore_args+0x0/0x30 [ 2.167931] [] ? kernel_init+0x0/0x1da [ 2.171671] [] ? child_rip+0x0/0x20 [ 2.187607] ---[ end trace a7919e7f17c0a725 ]--- Venkatesh Pallipadi said: | Looks like the problem started with this commit | | commit ce69a784504222c3ab6f1b3c357d09ec5772127a | Author: Gleb Natapov | Date: Mon Jul 20 15:24:17 2009 +0300 | | x86/apic: Enable x2APIC without interrupt remapping under KVM | | Before this commit, dmar_table_init() was getting called | with interrupts enabled and after this commit, it is getting | called with interrupts disabled. so try to move out dmar_table_init out of that function. 
Analyzed-by: Venkatesh Pallipadi Signed-off-by: Yinghai Lu Cc: Peter Zijlstra Cc: Gleb Natapov Cc: Suresh Siddha Cc: "Pallipadi, Venkatesh" LKML-Reference: <4A899F3C.2050104@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index de039fcdd05..3fc3a6c428d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1365,14 +1365,6 @@ void enable_x2apic(void) int __init enable_IR(void) { #ifdef CONFIG_INTR_REMAP - int ret; - - ret = dmar_table_init(); - if (ret) { - pr_debug("dmar_table_init() failed with %d:\n", ret); - return 0; - } - if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); return 0; @@ -1400,6 +1392,14 @@ void __init enable_IR_x2apic(void) unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; int ret, x2apic_enabled = 0; + int dmar_table_init_ret = 0; + +#ifdef CONFIG_INTR_REMAP + dmar_table_init_ret = dmar_table_init(); + if (dmar_table_init_ret) + pr_debug("dmar_table_init() failed with %d:\n", + dmar_table_init_ret); +#endif ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { @@ -1417,7 +1417,11 @@ void __init enable_IR_x2apic(void) mask_8259A(); mask_IO_APIC_setup(ioapic_entries); - ret = enable_IR(); + if (dmar_table_init_ret) + ret = 0; + else + ret = enable_IR(); + if (!ret) { /* IR is required if there is APIC ID > 255 even when running * under KVM -- cgit v1.2.3-70-g09d2 From 8126dec32738421afa362114337331337b4be17f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 20 Aug 2009 20:23:11 +0800 Subject: x86: Fix system crash when loading with "reservetop" parameter The system will die if the kernel is booted with "reservetop" parameter, in present code, parse "reservetop" parameter after early_ioremap_init(), and some function still use early_ioremap() after it. 
The problem is that the "reservetop" parameter can modify 'FIXADDR_TOP': the virtual address returned by early_ioremap() is then based on the old 'FIXADDR_TOP', but the page mapping is based on the new 'FIXADDR_TOP'. This causes a page fault, and since the IDT is not prepared yet, the system dies. So, call parse_early_param() before early_ioremap_init() in this patch. Signed-off-by: Xiao Guangrong Cc: yinghai@kernel.org Cc: Andrew Morton LKML-Reference: <4A8D402F.4080805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..02643cc3bf2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -711,6 +711,11 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); @@ -793,11 +798,6 @@ void __init setup_arch(char **cmdline_p) #endif #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_early_param(); - #ifdef CONFIG_X86_64 check_efer(); #endif -- cgit v1.2.3-70-g09d2 From 3e0e1e9c5a327d4dba8490d83ef55c0564e6e8a7 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 21 Aug 2009 04:34:45 -0400 Subject: x86: Fix an incorrect argument of reserve_bootmem() This line looks suspicious, because if this is true, then the 'flags' parameter of function reserve_bootmem_generic() will be unused when !CONFIG_NUMA. I don't think this is what we want. 
Signed-off-by: WANG Cong Cc: Yinghai Lu Cc: akpm@linux-foundation.org LKML-Reference: <20090821083709.5098.52505.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6176fe8f29e..ea56b8cbb6a 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -796,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return ret; #else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + reserve_bootmem(phys, len, flags); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { -- cgit v1.2.3-70-g09d2 From 8cab02dc3c58a12235c6d463ce684dded9696848 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 18:19:45 +0200 Subject: x86: Do not unregister PIT clocksource on PIT oneshot setup/shutdown This basically reverts commit 1a0c009ac (x86: unregister PIT clocksource when PIT is disabled) because the problem which that patch tried to address has been solved by commit 3f68535ada (clocksource: sanity check sysfs clocksource changes). The problem addressed by the original patch is that PIT could be selected as clocksource after the system switched the PIT off or set the PIT into one shot mode which would result in complete timekeeping wreckage. Now with the sysfs sanity check in place PIT cannot be selected again when the system is in oneshot mode. The system will not switch to one shot mode as long as PIT is installed because PIT is not suitable for one shot. The shutdown case which happens when the lapic timer is installed is covered by the fact that init_pit_clocksource() is called after the lapic timer takes over and then does not install the PIT clocksource at all. We should have done the sanity checks back then, but ... This also solves the locking problem which was reported vs. the clocksource rework. 
LKML-Reference: Cc: Martin Schwidefsky Cc: john stultz Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i8253.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 5cf36c053ac..23c167925a5 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -19,12 +19,6 @@ DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -#ifdef CONFIG_X86_32 -static void pit_disable_clocksource(void); -#else -static inline void pit_disable_clocksource(void) { } -#endif - /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode, outb_pit(0, PIT_CH0); outb_pit(0, PIT_CH0); } - pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - pit_disable_clocksource(); outb_pit(0x38, PIT_MODE); break; @@ -200,17 +192,6 @@ static struct clocksource pit_cs = { .shift = 20, }; -static void pit_disable_clocksource(void) -{ - /* - * Use mult to check whether it is registered or not - */ - if (pit_cs.mult) { - clocksource_unregister(&pit_cs); - pit_cs.mult = 0; - } -} - static int __init init_pit_clocksource(void) { /* -- cgit v1.2.3-70-g09d2 From da15cfdae03351c689736f8d142618592e3cebc3 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 19 Aug 2009 19:13:34 -0700 Subject: time: Introduce CLOCK_REALTIME_COARSE After talking with some application writers who want very fast, but not fine-grained timestamps, I decided to try to implement new clock_ids to clock_gettime(): CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE which returns the time at the last tick. This is very fast as we don't have to access any hardware (which can be very painful if you're using something like the acpi_pm clocksource), and we can even use the vdso clock_gettime() method to avoid the syscall. The only trade off is you only get low-res tick grained time resolution. 
This isn't a new idea; I know Ingo has a patch in the -rt tree that made the vsyscall gettimeofday() return coarse grained time when the vsyscall64 sysctl was set to 2. However this affects all applications on a system. With this method, applications can choose the proper speed/granularity trade-off for themselves. Signed-off-by: John Stultz Cc: Andi Kleen Cc: nikolag@ca.ibm.com Cc: Darren Hart Cc: arjan@infradead.org Cc: jonathan@jonmasters.org LKML-Reference: <1250734414.6897.5.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/vgtod.h | 1 + arch/x86/kernel/vsyscall_64.c | 1 + arch/x86/vdso/vclock_gettime.c | 39 ++++++++++++++++++++++++++++++++++++--- include/linux/time.h | 4 ++++ kernel/posix-timers.c | 35 +++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index dc27a69e5d2..3d61e204826 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -21,6 +21,7 @@ struct vsyscall_gtod_data { u32 shift; } clock; struct timespec wall_to_monotonic; + struct timespec wall_time_coarse; }; extern struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 25ee06a80aa..cf53a78e2dc 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 
6a40b78b46a..ee55754cc3c 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -86,14 +86,47 @@ notrace static noinline int do_monotonic(struct timespec *ts) return 0; } +notrace static noinline int do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = read_seqbegin(&gtod->lock); + ts->tv_sec = gtod->wall_time_coarse.tv_sec; + ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; + } while (unlikely(read_seqretry(&gtod->lock, seq))); + return 0; +} + +notrace static noinline int do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq, ns, secs; + do { + seq = read_seqbegin(&gtod->lock); + secs = gtod->wall_time_coarse.tv_sec; + ns = gtod->wall_time_coarse.tv_nsec; + secs += gtod->wall_to_monotonic.tv_sec; + ns += gtod->wall_to_monotonic.tv_nsec; + } while (unlikely(read_seqretry(&gtod->lock, seq))); + vset_normalized_timespec(ts, secs, ns); + return 0; +} + notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) + if (likely(gtod->sysctl_enabled)) switch (clock) { case CLOCK_REALTIME: - return do_realtime(ts); + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; case CLOCK_MONOTONIC: - return do_monotonic(ts); + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); } return vdso_fallback_gettime(clock, ts); } diff --git a/include/linux/time.h b/include/linux/time.h index f505988398e..256232f7e5e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -110,6 +110,8 @@ extern int timekeeping_suspended; unsigned long get_seconds(void); struct timespec current_kernel_time(void); +struct timespec __current_kernel_time(void); /* does not hold xtime_lock */ +struct timespec get_monotonic_coarse(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { 
get_seconds(), 0 }) @@ -243,6 +245,8 @@ struct itimerval { #define CLOCK_PROCESS_CPUTIME_ID 2 #define CLOCK_THREAD_CPUTIME_ID 3 #define CLOCK_MONOTONIC_RAW 4 +#define CLOCK_REALTIME_COARSE 5 +#define CLOCK_MONOTONIC_COARSE 6 /* * The IDs of various hardware clocks: diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d089d052c4a..495440779ce 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) return 0; } + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = ktime_to_timespec(KTIME_LOW_RES); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -262,10 +281,26 @@ static __init int init_posix_timers(void) .timer_create = no_timer_create, .nsleep = no_nsleep, }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct 
k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 15e06defca5..03cbeb34d14 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -847,6 +847,10 @@ unsigned long get_seconds(void) } EXPORT_SYMBOL(get_seconds); +struct timespec __current_kernel_time(void) +{ + return xtime_cache; +} struct timespec current_kernel_time(void) { @@ -862,3 +866,20 @@ struct timespec current_kernel_time(void) return now; } EXPORT_SYMBOL(current_kernel_time); + +struct timespec get_monotonic_coarse(void) +{ + struct timespec now, mono; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + mono = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, + now.tv_nsec + mono.tv_nsec); + return now; +} -- cgit v1.2.3-70-g09d2 From 8b5a10fc6fd02289ea03480f93382b1a99006142 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 19 Aug 2009 08:40:48 +0100 Subject: x86: properly annotate alternatives.c Some of the NOPs tables aren't used on 64-bits, quite some code and data is needed post-init for module loading only, and a couple of functions aren't used outside that file (i.e. can be static, and don't need to be exported). The change to __INITDATA/__INITRODATA is needed to avoid an assembler warning. Signed-off-by: Jan Beulich LKML-Reference: <4A8BC8A00200007800010823@vpn.id2.novell.com> Acked-by: Sam Ravnborg Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/alternative.h | 7 ----- arch/x86/kernel/alternative.c | 56 ++++++++++++++++++++++---------------- include/linux/init.h | 12 ++++++-- 3 files changed, 42 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 1a37bcdc860..c240efc74e0 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -73,8 +73,6 @@ static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} #endif /* CONFIG_SMP */ -const unsigned char *const *find_nop_table(void); - /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ \ @@ -144,8 +142,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif -extern void add_nops(void *insns, unsigned int len); - /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. @@ -161,10 +157,7 @@ extern void add_nops(void *insns, unsigned int len); * Intel's errata. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. - * The _early version expects the memory to already be RW. 
*/ - extern void *text_poke(void *addr, const void *opcode, size_t len); -extern void *text_poke_early(void *addr, const void *opcode, size_t len); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f5765870257..486935143e0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ __setup("smp-alt-boot", bootonly); #define smp_alt_once 1 #endif -static int debug_alternative; +static int __initdata_or_module debug_alternative; static int __init debug_alt(char *str) { @@ -51,7 +52,7 @@ static int __init setup_noreplace_smp(char *str) __setup("noreplace-smp", setup_noreplace_smp); #ifdef CONFIG_PARAVIRT -static int noreplace_paravirt = 0; +static int __initdata_or_module noreplace_paravirt = 0; static int __init setup_noreplace_paravirt(char *str) { @@ -64,16 +65,17 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) -#ifdef GENERIC_NOP1 +#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) /* Use inline assembly to define this because the nops are defined as inline assembly strings in the include files and we cannot get them easily into strings. 
*/ -asm("\t.section .rodata, \"a\"\nintelnops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8 "\t.previous"); extern const unsigned char intelnops[]; -static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +intel_nops[ASM_NOP_MAX+1] = { NULL, intelnops, intelnops + 1, @@ -87,12 +89,13 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = { #endif #ifdef K8_NOP1 -asm("\t.section .rodata, \"a\"\nk8nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 K8_NOP7 K8_NOP8 "\t.previous"); extern const unsigned char k8nops[]; -static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k8_nops[ASM_NOP_MAX+1] = { NULL, k8nops, k8nops + 1, @@ -105,13 +108,14 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = { }; #endif -#ifdef K7_NOP1 -asm("\t.section .rodata, \"a\"\nk7nops: " +#if defined(K7_NOP1) && !defined(CONFIG_X86_64) +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 K7_NOP7 K7_NOP8 "\t.previous"); extern const unsigned char k7nops[]; -static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +k7_nops[ASM_NOP_MAX+1] = { NULL, k7nops, k7nops + 1, @@ -125,12 +129,13 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = { #endif #ifdef P6_NOP1 -asm("\t.section .rodata, \"a\"\np6nops: " +asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 P6_NOP7 P6_NOP8 "\t.previous"); extern const unsigned char p6nops[]; -static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char *const __initconst_or_module +p6_nops[ASM_NOP_MAX+1] = { NULL, p6nops, p6nops + 
1, @@ -146,7 +151,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_has(X86_FEATURE_NOPL)) @@ -157,7 +162,7 @@ const unsigned char *const *find_nop_table(void) #else /* CONFIG_X86_64 */ -const unsigned char *const *find_nop_table(void) +static const unsigned char *const *__init_or_module find_nop_table(void) { if (boot_cpu_has(X86_FEATURE_K8)) return k8_nops; @@ -172,7 +177,7 @@ const unsigned char *const *find_nop_table(void) #endif /* CONFIG_X86_64 */ /* Use this to add nops to a buffer, then text_poke the whole buffer. */ -void add_nops(void *insns, unsigned int len) +static void __init_or_module add_nops(void *insns, unsigned int len) { const unsigned char *const *noptable = find_nop_table(); @@ -185,10 +190,10 @@ void add_nops(void *insns, unsigned int len) len -= noplen; } } -EXPORT_SYMBOL_GPL(add_nops); extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern u8 *__smp_locks[], *__smp_locks_end[]; +static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -196,7 +201,8 @@ extern u8 *__smp_locks[], *__smp_locks_end[]; APs have less capabilities than the boot processor are not handled. Tough. Make sure you disable such features by hand. 
*/ -void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) { struct alt_instr *a; char insnbuf[MAX_PATCH_LEN]; @@ -279,9 +285,10 @@ static LIST_HEAD(smp_alt_modules); static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ -void alternatives_smp_module_add(struct module *mod, char *name, - void *locks, void *locks_end, - void *text, void *text_end) +void __init_or_module alternatives_smp_module_add(struct module *mod, + char *name, + void *locks, void *locks_end, + void *text, void *text_end) { struct smp_alt_module *smp; @@ -317,7 +324,7 @@ void alternatives_smp_module_add(struct module *mod, char *name, mutex_unlock(&smp_alt); } -void alternatives_smp_module_del(struct module *mod) +void __init_or_module alternatives_smp_module_del(struct module *mod) { struct smp_alt_module *item; @@ -386,8 +393,8 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT -void apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) +void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) { struct paravirt_patch_site *p; char insnbuf[MAX_PATCH_LEN]; @@ -485,7 +492,8 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. 
*/ -void *text_poke_early(void *addr, const void *opcode, size_t len) +static void *__init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; local_irq_save(flags); diff --git a/include/linux/init.h b/include/linux/init.h index 13b633ed695..400adbb4541 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -103,8 +103,8 @@ #define __INIT .section ".init.text","ax" #define __FINIT .previous -#define __INITDATA .section ".init.data","aw" -#define __INITRODATA .section ".init.rodata","a" +#define __INITDATA .section ".init.data","aw",%progbits +#define __INITRODATA .section ".init.rodata","a",%progbits #define __FINITDATA .previous #define __DEVINIT .section ".devinit.text", "ax" @@ -305,9 +305,17 @@ void __init parse_early_options(char *cmdline); #ifdef CONFIG_MODULES #define __init_or_module #define __initdata_or_module +#define __initconst_or_module +#define __INIT_OR_MODULE .text +#define __INITDATA_OR_MODULE .data +#define __INITRODATA_OR_MODULE .section ".rodata","a",%progbits #else #define __init_or_module __init #define __initdata_or_module __initdata +#define __initconst_or_module __initconst +#define __INIT_OR_MODULE __INIT +#define __INITDATA_OR_MODULE __INITDATA +#define __INITRODATA_OR_MODULE __INITRODATA #endif /*CONFIG_MODULES*/ /* Functions marked as __devexit may be discarded at kernel link time, depending -- cgit v1.2.3-70-g09d2 From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:36 -0700 Subject: x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init SDM Vol 3a section titled "MTRR considerations in MP systems" specifies the need for synchronizing the logical cpu's while initializing/updating MTRR. Currently Linux kernel does the synchronization of all cpu's only when a single MTRR register is programmed/updated. 
During an AP online (during boot/cpu-online/resume) where we initialize all the MTRR/PAT registers, we don't follow this synchronization algorithm. This can lead to scenarios where during a dynamic cpu online, that logical cpu is initializing MTRR/PAT with cache disabled (cr0.cd=1) etc while the other logical HT sibling continues to run (also with cache disabled because of cr0.cd=1 on its sibling). Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly (because of some VMX performance optimizations) and the above scenario (with one logical cpu doing VMX activity and another logical cpu coming online) can result in a system crash. Fix the MTRR initialization by doing a rendezvous of all the cpus. During boot and resume, we delay the MTRR/PAT init for APs till all the logical cpus come online, and the rendezvous process at the end of the APs' bringup will initialize the MTRR/PAT for all APs. For dynamic single cpu online, we synchronize all the logical cpus and do the MTRR/PAT init on the AP that is coming online. Signed-off-by: Suresh Siddha Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/mtrr.h | 7 +++++++ arch/x86/kernel/cpu/mtrr/main.c | 46 +++++++++++++++++++++++++++++++++-------- arch/x86/kernel/smpboot.c | 14 +++++++++++++ arch/x86/power/cpu.c | 2 +- kernel/cpu.c | 14 +++++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467d..d5366ec5cb8 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -121,8 +121,12 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void set_mtrr_aps_delayed_init(void); +extern void mtrr_aps_init(void); +extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -161,6 +165,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define set_mtrr_aps_delayed_init() do {} while (0) +#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_restore() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a416..7339be0aa58 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +u32 mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * 
Initialize the MTRRs inaddition to the synchronisation. + */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = 1; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = 0; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..d720b7e0cf3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1116,9 +1116,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. 
*/ @@ -1140,6 +1153,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac6..417c9f5b4af 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -242,7 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); - mtrr_ap_init(); + mtrr_bp_restore(); #ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..f5f9485b8c0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); -- cgit v1.2.3-70-g09d2 From 5400743db5a06a4e6e298725a2044c40edcb27b9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Aug 2009 17:00:02 -0700 Subject: x86, mtrr: make mtrr_aps_delayed_init static bool mtrr_aps_delayed_init was declared u32 and made global, but it only ever takes boolean values and is only ever used in arch/x86/kernel/cpu/mtrr/main.c. Declare it "static bool" and remove external references. Signed-off-by: H.
Peter Anvin Cc: Suresh Siddha --- arch/x86/include/asm/mtrr.h | 1 - arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index d5366ec5cb8..4365ffdb461 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -126,7 +126,6 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7339be0aa58..84e83de5457 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,7 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -u32 mtrr_aps_delayed_init; +static bool mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -758,7 +758,7 @@ void set_mtrr_aps_delayed_init(void) if (!use_intel()) return; - mtrr_aps_delayed_init = 1; + mtrr_aps_delayed_init = true; } /* @@ -770,7 +770,7 @@ void mtrr_aps_init(void) return; set_mtrr(~0U, 0, 0, 0); - mtrr_aps_delayed_init = 0; + mtrr_aps_delayed_init = false; } void mtrr_bp_restore(void) -- cgit v1.2.3-70-g09d2 From 366d19e181be873c70f4aafca3931d77d781ccd7 Mon Sep 17 00:00:00 2001 From: Tobias Doerffel Date: Fri, 21 Aug 2009 23:06:23 +0200 Subject: x86: add specific support for Intel Atom architecture Add another option when selecting CPU family so the kernel can be optimized for Intel Atom CPUs. If GCC supports tuning options for Intel Atom they will be used. Signed-off-by: Tobias Doerffel Signed-off-by: H. 
Peter Anvin LKML-Reference: <1251018457-19157-1-git-send-email-tobias.doerffel@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 19 ++++++++++++++----- arch/x86/Makefile | 2 ++ arch/x86/Makefile_32.cpu | 2 ++ arch/x86/include/asm/module.h | 2 ++ 4 files changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8130334329c..527519b8a9f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -262,6 +262,15 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) +config MATOM + bool "Intel Atom" + ---help--- + + Select this for the Intel Atom platform. Intel Atom CPUs have an + in-order pipelining architecture and thus can benefit from + accordingly optimized code. Use a recent GCC with specific Atom + support in order to fully benefit from selecting this option. + config GENERIC_CPU bool "Generic-x86-64" depends on X86_64 @@ -295,7 +304,7 @@ config X86_CPU config X86_L1_CACHE_BYTES int default "128" if MPSC - default "64" if GENERIC_CPU || MK8 || MCORE2 || X86_32 + default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 config X86_INTERNODE_CACHE_BYTES int @@ -310,7 +319,7 @@ config X86_L1_CACHE_SHIFT default "7" if MPENTIUM4 || MPSC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU config X86_XADD def_bool y @@ -359,7 +368,7 @@ config X86_INTEL_USERCOPY config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || 
MCORE2 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM config X86_USE_3DNOW def_bool y @@ -387,7 +396,7 @@ config X86_P6_NOP config X86_TSC def_bool y - depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 + depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 config X86_CMPXCHG64 def_bool y @@ -397,7 +406,7 @@ config X86_CMPXCHG64 # generates cmov. config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM) config X86_MINIMUM_CPU_FAMILY int diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b..8a4c24c96d0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -55,6 +55,8 @@ else cflags-$(CONFIG_MCORE2) += \ $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ + $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 80177ec052f..30e9a264f69 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -33,6 +33,8 @@ cflags-$(CONFIG_MCYRIXIII) += $(call 
cc-option,-march=c3,-march=i486) $(align)-f cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) +cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ + $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) # AMD Elan support cflags-$(CONFIG_X86_ELAN) += -march=i486 diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 47d62743c4d..e959c4afab5 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -28,6 +28,8 @@ struct mod_arch_specific {}; #define MODULE_PROC_FAMILY "586MMX " #elif defined CONFIG_MCORE2 #define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MATOM +#define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 #define MODULE_PROC_FAMILY "686 " #elif defined CONFIG_MPENTIUMII -- cgit v1.2.3-70-g09d2 From 10f02d1168585edf66229bb2ec90a42f32667a78 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 23 Aug 2009 23:17:27 +0400 Subject: x86: uv: Clean up uv_ptc_init(), use proc_create() create_proc_entry() is getting duhprecated. 
Signed-off-by: Alexey Dobriyan Cc: cpw@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 77b9689f8ed..503c1f2e883 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -640,13 +640,13 @@ static int __init uv_ptc_init(void) if (!is_uv_system()) return 0; - proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); + proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, + &proc_uv_ptc_operations); if (!proc_uv_ptc) { printk(KERN_ERR "unable to create %s proc entry\n", UV_PTC_BASENAME); return -EINVAL; } - proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; return 0; } -- cgit v1.2.3-70-g09d2 From 005155b1f626d2b2d7932e4afdf4fead168c6888 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 25 Aug 2009 15:35:12 +0200 Subject: x86: Fix x86_model test in es7000_apic_is_cluster() For the x86_model to be greater than 6 or less than 12 is logically always true. 
Signed-off-by: Roel Kluin Cc: Andrew Morton Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/es7000_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 69328ac8de9..420f95da7bf 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -167,7 +167,7 @@ static int es7000_apic_is_cluster(void) { /* MPENTIUMIII */ if (boot_cpu_data.x86 == 6 && - (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) + (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) return 1; return 0; -- cgit v1.2.3-70-g09d2 From 667000011927b4fcc359beac4a2447889db6d349 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:11 -0700 Subject: tracing: Rename FTRACE_SYSCALLS for tracepoints s/HAVE_FTRACE_SYSCALLS/HAVE_SYSCALL_TRACEPOINTS/g s/TIF_SYSCALL_FTRACE/TIF_SYSCALL_TRACEPOINT/g The syscall enter/exit tracing is no longer specific to just ftrace, so they now have names that reflect their tie to tracepoints instead. 
Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-2-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/s390/Kconfig | 2 +- arch/s390/defconfig | 2 +- arch/s390/include/asm/thread_info.h | 4 ++-- arch/s390/kernel/entry.S | 2 +- arch/s390/kernel/entry64.S | 2 +- arch/s390/kernel/ptrace.c | 4 ++-- arch/x86/Kconfig | 2 +- arch/x86/configs/i386_defconfig | 2 +- arch/x86/configs/x86_64_defconfig | 2 +- arch/x86/include/asm/thread_info.h | 13 +++++++------ arch/x86/kernel/ptrace.c | 4 ++-- kernel/trace/Kconfig | 4 ++-- kernel/tracepoint.c | 4 ++-- 13 files changed, 24 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2ae5d72f47e..7238ef4c7a6 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -84,7 +84,7 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FTRACE_SYSCALLS + select HAVE_SYSCALL_TRACEPOINTS select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER select HAVE_DEFAULT_NO_SPIN_MUTEXES diff --git a/arch/s390/defconfig b/arch/s390/defconfig index fcba206529f..4e91a2573cc 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -900,7 +900,7 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_TRACING_SUPPORT=y CONFIG_FTRACE=y # CONFIG_FUNCTION_TRACER is not set diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index ba1cab9fc1f..07eb61b2fb3 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -92,7 +92,7 
@@ static inline struct thread_info *current_thread_info(void) #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing active */ #define TIF_SECCOMP 10 /* secure computing */ -#define TIF_SYSCALL_FTRACE 11 /* ftrace syscall instrumentation */ +#define TIF_SYSCALL_TRACEPOINT 11 /* syscall tracepoint instrumentation */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -111,7 +111,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_TRACE (1<>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) + _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER STACK_SIZE = 1 << STACK_SHIFT diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index f6618e9e15e..3ceb53c9c49 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -57,7 +57,7 @@ _TIF_WORK_SVC = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_MCCK_PENDING) _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | _TIF_SYSCALL_AUDIT>>8 | \ - _TIF_SECCOMP>>8 | _TIF_SYSCALL_FTRACE>>8) + _TIF_SECCOMP>>8 | _TIF_SYSCALL_TRACEPOINT>>8) #define BASED(name) name-system_call(%r13) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index c5e87d891ca..9d3dcfa79ea 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -664,7 +664,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) ret = -1; } - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_enter(regs, regs->gprs[2]); if (unlikely(current->audit_context)) @@ -682,7 +682,7 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]); - if 
(unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_exit(regs, regs->gprs[2]); if (test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..d59cbf758f3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -37,7 +37,7 @@ config X86 select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE - select HAVE_FTRACE_SYSCALLS + select HAVE_SYSCALL_TRACEPOINTS select HAVE_KVM select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index edb992ebef9..d28fad19654 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -2355,7 +2355,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index cee1dd2e69b..6c86acd847a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -2329,7 +2329,7 @@ CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y -CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y CONFIG_TRACING_SUPPORT=y diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fad7d40b75f..6f7786aea4f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,7 +95,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the 
mmu lazily */ -#define TIF_SYSCALL_FTRACE 28 /* for ftrace syscall instrumentation */ +#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -118,17 +118,17 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_FTRACE (1 << TIF_SYSCALL_FTRACE) +#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_FTRACE | \ - _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP) + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_FTRACE) + _TIF_SYSCALL_TRACEPOINT) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -137,7 +137,8 @@ struct thread_info { _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) /* work to do on any return to user space */ -#define _TIF_ALLWORK_MASK ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_FTRACE) +#define _TIF_ALLWORK_MASK \ + ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 34dd6f15185..a909afef44f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1500,7 +1500,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) tracehook_report_syscall_entry(regs)) ret = -1L; - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_enter(regs, regs->orig_ax); if 
(unlikely(current->audit_context)) { @@ -1526,7 +1526,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if (unlikely(test_thread_flag(TIF_SYSCALL_FTRACE))) + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_syscall_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 019f380fd76..06be85a7ef8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -41,7 +41,7 @@ config HAVE_FTRACE_MCOUNT_RECORD config HAVE_HW_BRANCH_TRACER bool -config HAVE_FTRACE_SYSCALLS +config HAVE_SYSCALL_TRACEPOINTS bool config TRACER_MAX_TRACE @@ -211,7 +211,7 @@ config ENABLE_DEFAULT_TRACERS config FTRACE_SYSCALLS bool "Trace syscalls" - depends on HAVE_FTRACE_SYSCALLS + depends on HAVE_SYSCALL_TRACEPOINTS select GENERIC_TRACER select KALLSYMS help diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 06f165a4408..be86b9a01a0 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -590,7 +590,7 @@ void syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } @@ -608,7 +608,7 @@ void syscall_unregfunc(void) if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); + clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } -- cgit v1.2.3-70-g09d2 From 97419875865859fd2403e66266c02ce028e2f5ab Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:13 -0700 Subject: tracing: Move tracepoint callbacks from declaration to definition It's not strictly correct for the tracepoint reg/unreg 
callbacks to occur when a client is hooking up, because the actual tracepoint may not be present yet. This happens to be fine for syscall, since that's in the core kernel, but it would cause problems for tracepoints defined in a module that hasn't been loaded yet. It also means the reg/unreg has to be EXPORTed for any modules to use the tracepoint (as in SystemTap). This patch removes DECLARE_TRACE_WITH_CALLBACK, and instead introduces DEFINE_TRACE_FN which stores the callbacks in struct tracepoint. The callbacks are used now when the active state of the tracepoint changes in set_tracepoint & disable_tracepoint. This also introduces TRACE_EVENT_FN, so ftrace events can also provide registration callbacks if needed. Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-4-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/s390/kernel/ptrace.c | 4 ++-- arch/x86/kernel/ptrace.c | 4 ++-- include/linux/tracepoint.h | 46 +++++++++++++++++--------------------------- include/trace/define_trace.h | 5 +++++ include/trace/ftrace.h | 9 +++++++++ include/trace/syscall.h | 12 ++++-------- kernel/tracepoint.c | 14 +++++++++----- 7 files changed, 49 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 9d3dcfa79ea..c05b44b80c2 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -51,8 +51,8 @@ #include "compat_ptrace.h" #endif -DEFINE_TRACE(syscall_enter); -DEFINE_TRACE(syscall_exit); +DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); +DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); enum s390_regset { REGSET_GENERAL, diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c 
index a909afef44f..31e9b97ec4d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -37,8 +37,8 @@ #include <trace/syscall.h> -DEFINE_TRACE(syscall_enter); -DEFINE_TRACE(syscall_exit); +DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); +DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); #include "tls.h" diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 5984ed04c03..846a4ae501e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -23,6 +23,8 @@ struct tracepoint; struct tracepoint { const char *name; /* Tracepoint name */ int state; /* State. */ + void (*regfunc)(void); + void (*unregfunc)(void); void **funcs; } __attribute__((aligned(32))); /* * Aligned on 32 bytes because it is @@ -60,10 +62,8 @@ struct tracepoint { * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. - * An optional set of (un)registration functions can be passed to perform any - * additional (un)registration work.
*/ -#define DECLARE_TRACE_WITH_CALLBACK(name, proto, args, reg, unreg) \ +#define DECLARE_TRACE(name, proto, args) \ extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ @@ -73,36 +73,23 @@ struct tracepoint { } \ static inline int register_trace_##name(void (*probe)(proto)) \ { \ - int ret; \ - void (*func)(void) = reg; \ - \ - ret = tracepoint_probe_register(#name, (void *)probe); \ - if (func && !ret) \ - func(); \ - return ret; \ + return tracepoint_probe_register(#name, (void *)probe); \ } \ static inline int unregister_trace_##name(void (*probe)(proto)) \ { \ - int ret; \ - void (*func)(void) = unreg; \ - \ - ret = tracepoint_probe_unregister(#name, (void *)probe);\ - if (func && !ret) \ - func(); \ - return ret; \ + return tracepoint_probe_unregister(#name, (void *)probe);\ } -#define DECLARE_TRACE(name, proto, args) \ - DECLARE_TRACE_WITH_CALLBACK(name, TP_PROTO(proto), TP_ARGS(args),\ - NULL, NULL); - -#define DEFINE_TRACE(name) \ +#define DEFINE_TRACE_FN(name, reg, unreg) \ static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"), aligned(32))) = \ - { __tpstrtab_##name, 0, NULL } + { __tpstrtab_##name, 0, reg, unreg, NULL } + +#define DEFINE_TRACE(name) \ + DEFINE_TRACE_FN(name, NULL, NULL); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name) @@ -113,7 +100,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end); #else /* !CONFIG_TRACEPOINTS */ -#define DECLARE_TRACE_WITH_CALLBACK(name, proto, args, reg, unreg) \ +#define DECLARE_TRACE(name, proto, args) \ static inline void _do_trace_##name(struct tracepoint *tp, proto) \ { } \ static inline void trace_##name(proto) \ @@ -127,10 +114,7 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin, return -ENOSYS; \ } -#define DECLARE_TRACE(name, proto, args) \ - 
DECLARE_TRACE_WITH_CALLBACK(name, TP_PROTO(proto), TP_ARGS(args),\ - NULL, NULL); - +#define DEFINE_TRACE_FN(name, reg, unreg) #define DEFINE_TRACE(name) #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) #define EXPORT_TRACEPOINT_SYMBOL(name) @@ -282,10 +266,16 @@ static inline void tracepoint_synchronize_unregister(void) * can also by used by generic instrumentation like SystemTap), and * it is also used to expose a structured trace record in * /sys/kernel/debug/tracing/events/. + * + * A set of (un)registration functions can be passed to the variant + * TRACE_EVENT_FN to perform any (un)registration work. */ #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#define TRACE_EVENT_FN(name, proto, args, struct, \ + assign, print, reg, unreg) \ + DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) #endif #endif diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index f7a7ae1e8f9..2a969850736 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,6 +26,11 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) +#undef TRACE_EVENT_FN +#define TRACE_EVENT_FN(name, proto, args, tstruct, \ + assign, print, reg, unreg) \ + DEFINE_TRACE_FN(name, reg, unreg) + #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 127400255e4..3a0b44bdabf 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -42,6 +42,15 @@ }; \ static struct ftrace_event_call event_##name +/* Callbacks are meaningless to ftrace. 
*/ +#undef TRACE_EVENT_FN +#define TRACE_EVENT_FN(name, proto, args, tstruct, \ + assign, print, reg, unreg) \ + TRACE_EVENT(name, TP_PROTO(proto), TP_ARGS(args), \ + TP_STRUCT__entry(tstruct), \ + TP_fast_assign(assign), \ + TP_printk(print)) + #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 5dcb7e3a544..4e194300185 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -13,18 +13,14 @@ extern void syscall_regfunc(void); extern void syscall_unregfunc(void); -DECLARE_TRACE_WITH_CALLBACK(syscall_enter, +DECLARE_TRACE(syscall_enter, TP_PROTO(struct pt_regs *regs, long id), - TP_ARGS(regs, id), - syscall_regfunc, - syscall_unregfunc + TP_ARGS(regs, id) ); -DECLARE_TRACE_WITH_CALLBACK(syscall_exit, +DECLARE_TRACE(syscall_exit, TP_PROTO(struct pt_regs *regs, long ret), - TP_ARGS(regs, ret), - syscall_regfunc, - syscall_unregfunc + TP_ARGS(regs, ret) ); #endif diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9e0a36f0e2a..1a6a453b7ef 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -243,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); + if (elem->regfunc && !elem->state && active) + elem->regfunc(); + else if (elem->unregfunc && elem->state && !active) + elem->unregfunc(); + /* * rcu_assign_pointer has a smp_wmb() which makes sure that the new * probe callbacks array is consistent before setting a pointer to it. 
@@ -262,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { + if (elem->unregfunc && elem->state) + elem->unregfunc(); + elem->state = 0; rcu_assign_pointer(elem->funcs, NULL); } @@ -578,7 +586,7 @@ __initcall(init_tracepoints); #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS -static DEFINE_MUTEX(regfunc_mutex); +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ static int sys_tracepoint_refcount; void syscall_regfunc(void) @@ -586,7 +594,6 @@ void syscall_regfunc(void) unsigned long flags; struct task_struct *g, *t; - mutex_lock(&regfunc_mutex); if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, t) { @@ -595,7 +602,6 @@ void syscall_regfunc(void) read_unlock_irqrestore(&tasklist_lock, flags); } sys_tracepoint_refcount++; - mutex_unlock(&regfunc_mutex); } void syscall_unregfunc(void) @@ -603,7 +609,6 @@ void syscall_unregfunc(void) unsigned long flags; struct task_struct *g, *t; - mutex_lock(&regfunc_mutex); sys_tracepoint_refcount--; if (!sys_tracepoint_refcount) { read_lock_irqsave(&tasklist_lock, flags); @@ -612,6 +617,5 @@ void syscall_unregfunc(void) } while_each_thread(g, t); read_unlock_irqrestore(&tasklist_lock, flags); } - mutex_unlock(&regfunc_mutex); } #endif -- cgit v1.2.3-70-g09d2 From 1c569f0264ea629c10bbab471dd0626ce4d3f19f Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Mon, 24 Aug 2009 14:43:14 -0700 Subject: tracing: Create generic syscall TRACE_EVENTs This converts the syscall_enter/exit tracepoints into TRACE_EVENTs, so you can have generic ftrace events that capture all system calls with arguments and return values. These generic events are also renamed to sys_enter/exit, so they're more closely aligned to the specific sys_enter_foo events.
Signed-off-by: Josh Stone Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zefan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Lai Jiangshan Cc: Paul Mundt Cc: Martin Schwidefsky Cc: Heiko Carstens LKML-Reference: <1251150194-1713-5-git-send-email-jistone@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/s390/kernel/ptrace.c | 8 ++--- arch/x86/kernel/ptrace.c | 12 +++---- include/trace/events/syscalls.h | 70 +++++++++++++++++++++++++++++++++++++++++ include/trace/syscall.h | 17 ---------- kernel/trace/trace_syscalls.c | 17 +++++----- 5 files changed, 88 insertions(+), 36 deletions(-) create mode 100644 include/trace/events/syscalls.h (limited to 'arch/x86') diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index c05b44b80c2..f3ddd7ac06c 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -51,8 +51,8 @@ #include "compat_ptrace.h" #endif -DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); -DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); +#define CREATE_TRACE_POINTS +#include enum s390_regset { REGSET_GENERAL, @@ -665,7 +665,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) } if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_enter(regs, regs->gprs[2]); + trace_sys_enter(regs, regs->gprs[2]); if (unlikely(current->audit_context)) audit_syscall_entry(is_compat_task() ? 
@@ -683,7 +683,7 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) regs->gprs[2]); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_exit(regs, regs->gprs[2]); + trace_sys_exit(regs, regs->gprs[2]); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 31e9b97ec4d..8d7d5c9c1be 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -35,13 +35,11 @@ #include #include -#include - -DEFINE_TRACE_FN(syscall_enter, syscall_regfunc, syscall_unregfunc); -DEFINE_TRACE_FN(syscall_exit, syscall_regfunc, syscall_unregfunc); - #include "tls.h" +#define CREATE_TRACE_POINTS +#include + enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1501,7 +1499,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) ret = -1L; if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_enter(regs, regs->orig_ax); + trace_sys_enter(regs, regs->orig_ax); if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1527,7 +1525,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_syscall_exit(regs, regs->ax); + trace_sys_exit(regs, regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, 0); diff --git a/include/trace/events/syscalls.h b/include/trace/events/syscalls.h new file mode 100644 index 00000000000..397dff2dbd5 --- /dev/null +++ b/include/trace/events/syscalls.h @@ -0,0 +1,70 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM syscalls + +#if !defined(_TRACE_EVENTS_SYSCALLS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENTS_SYSCALLS_H + +#include + +#include +#include + + +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS + +extern void syscall_regfunc(void); +extern void syscall_unregfunc(void); + +TRACE_EVENT_FN(sys_enter, + + TP_PROTO(struct pt_regs *regs, long id), + + 
TP_ARGS(regs, id), + + TP_STRUCT__entry( + __field( long, id ) + __array( unsigned long, args, 6 ) + ), + + TP_fast_assign( + __entry->id = id; + syscall_get_arguments(current, regs, 0, 6, __entry->args); + ), + + TP_printk("NR %ld (%lx, %lx, %lx, %lx, %lx, %lx)", + __entry->id, + __entry->args[0], __entry->args[1], __entry->args[2], + __entry->args[3], __entry->args[4], __entry->args[5]), + + syscall_regfunc, syscall_unregfunc +); + +TRACE_EVENT_FN(sys_exit, + + TP_PROTO(struct pt_regs *regs, long ret), + + TP_ARGS(regs, ret), + + TP_STRUCT__entry( + __field( long, id ) + __field( long, ret ) + ), + + TP_fast_assign( + __entry->id = syscall_get_nr(current, regs); + __entry->ret = ret; + ), + + TP_printk("NR %ld = %ld", + __entry->id, __entry->ret), + + syscall_regfunc, syscall_unregfunc +); + +#endif /* CONFIG_HAVE_SYSCALL_TRACEPOINTS */ + +#endif /* _TRACE_EVENTS_SYSCALLS_H */ + +/* This part must be outside protection */ +#include + diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 4e194300185..5dc283ba5ae 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -8,23 +8,6 @@ #include -#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS - -extern void syscall_regfunc(void); -extern void syscall_unregfunc(void); - -DECLARE_TRACE(syscall_enter, - TP_PROTO(struct pt_regs *regs, long id), - TP_ARGS(regs, id) -); - -DECLARE_TRACE(syscall_exit, - TP_PROTO(struct pt_regs *regs, long ret), - TP_ARGS(regs, ret) -); - -#endif - /* * A syscall entry in the ftrace syscalls array. 
* diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 46c1b977a2c..2698fe401eb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -286,7 +287,7 @@ int reg_event_syscall_enter(void *ptr) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) - ret = register_trace_syscall_enter(ftrace_syscall_enter); + ret = register_trace_sys_enter(ftrace_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -311,7 +312,7 @@ void unreg_event_syscall_enter(void *ptr) sys_refcount_enter--; clear_bit(num, enabled_enter_syscalls); if (!sys_refcount_enter) - unregister_trace_syscall_enter(ftrace_syscall_enter); + unregister_trace_sys_enter(ftrace_syscall_enter); mutex_unlock(&syscall_trace_lock); } @@ -327,7 +328,7 @@ int reg_event_syscall_exit(void *ptr) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) - ret = register_trace_syscall_exit(ftrace_syscall_exit); + ret = register_trace_sys_exit(ftrace_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall exit trace point"); @@ -352,7 +353,7 @@ void unreg_event_syscall_exit(void *ptr) sys_refcount_exit--; clear_bit(num, enabled_exit_syscalls); if (!sys_refcount_exit) - unregister_trace_syscall_exit(ftrace_syscall_exit); + unregister_trace_sys_exit(ftrace_syscall_exit); mutex_unlock(&syscall_trace_lock); } @@ -418,7 +419,7 @@ int reg_prof_syscall_enter(char *name) mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_enter) - ret = register_trace_syscall_enter(prof_syscall_enter); + ret = register_trace_sys_enter(prof_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -442,7 +443,7 @@ void unreg_prof_syscall_enter(char *name) sys_prof_refcount_enter--; clear_bit(num, enabled_prof_enter_syscalls); if (!sys_prof_refcount_enter) - 
unregister_trace_syscall_enter(prof_syscall_enter); + unregister_trace_sys_enter(prof_syscall_enter); mutex_unlock(&syscall_trace_lock); } @@ -479,7 +480,7 @@ int reg_prof_syscall_exit(char *name) mutex_lock(&syscall_trace_lock); if (!sys_prof_refcount_exit) - ret = register_trace_syscall_exit(prof_syscall_exit); + ret = register_trace_sys_exit(prof_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -503,7 +504,7 @@ void unreg_prof_syscall_exit(char *name) sys_prof_refcount_exit--; clear_bit(num, enabled_prof_exit_syscalls); if (!sys_prof_refcount_exit) - unregister_trace_syscall_exit(prof_syscall_exit); + unregister_trace_sys_exit(prof_syscall_exit); mutex_unlock(&syscall_trace_lock); } -- cgit v1.2.3-70-g09d2 From ab94fcf528d127fcb490175512a8910f37e5b346 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 25 Aug 2009 16:47:16 -0700 Subject: x86: allow "=rm" in native_save_fl() This is a partial revert of f1f029c7bfbf4ee1918b90a431ab823bed812504. "=rm" is allowed in this context, because "pop" is explicitly defined to adjust the stack pointer *before* it evaluates its effective address, if it has one. Thus, we do end up writing to the correct address even if we use an on-stack memory argument. The original reporter for f1f029c7bfbf4ee1918b90a431ab823bed812504 was apparently using a broken x86 simulator. [ Impact: performance ] Signed-off-by: H. 
Peter Anvin Cc: Gabe Black --- arch/x86/include/asm/irqflags.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index c6ccbe7e81a..9e2b952f810 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -13,14 +13,13 @@ static inline unsigned long native_save_fl(void) unsigned long flags; /* - * Note: this needs to be "=r" not "=rm", because we have the - * stack offset from what gcc expects at the time the "pop" is - * executed, and so a memory reference with respect to the stack - * would end up using the wrong address. + * "=rm" is safe here, because "pop" adjusts the stack before + * it evaluates its effective address -- this is part of the + * documented behavior of the "pop" instruction. */ asm volatile("# __raw_save_flags\n\t" "pushf ; pop %0" - : "=r" (flags) + : "=rm" (flags) : /* no input */ : "memory"); -- cgit v1.2.3-70-g09d2 From d560bc61575efae43595cbcb56d0ba3b9450139c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 25 Aug 2009 12:53:02 -0700 Subject: x86, xen: Suppress WP test on Xen Xen always runs on CPUs which properly support WP enforcement in privileged mode, so there's no need to test for it. This also works around a crash reported by Arnd Hannemann, though I think it's just a band-aid for that case. Reported-by: Arnd Hannemann Signed-off-by: Jeremy Fitzhardinge Acked-by: Pekka Enberg Signed-off-by: H.
Peter Anvin --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e90540a46a0..0b755cd7686 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1059,6 +1059,7 @@ asmlinkage void __init xen_start_kernel(void) /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); new_cpu_data.hard_math = 1; + new_cpu_data.wp_works_ok = 1; new_cpu_data.x86_capability[0] = cpuid_edx(1); #endif -- cgit v1.2.3-70-g09d2 From 7adb4df410966dfe43e4815256e3215110648fb8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 25 Aug 2009 21:06:03 -0700 Subject: x86, xen: Initialize cx to suppress warning Initialize cx before calling xen_cpuid(), in order to suppress the "may be used uninitialized in this function" warning. Signed-off-by: H. Peter Anvin Cc: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0b755cd7686..eb33aaa8415 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -215,6 +215,7 @@ static __init void xen_init_cpuid_mask(void) (1 << X86_FEATURE_ACPI)); /* disable ACPI */ ax = 1; + cx = 0; xen_cpuid(&ax, &bx, &cx, &dx); /* cpuid claims we support xsave; try enabling it to see what happens */ -- cgit v1.2.3-70-g09d2 From 8f3e1df48baf728bbb0f242c9dff9c9d7108218a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 24 Aug 2009 21:53:36 +0400 Subject: x86, ioapic: Define IO_APIC_DEFAULT_PHYS_BASE constant We already have APIC_DEFAULT_PHYS_BASE so just to be consistent. 
Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090824175550.927946757@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 3 ++- arch/x86/kernel/mpparse.c | 10 +++++----- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7ddb36ab933..7386bfa4f4b 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -8,7 +8,8 @@ * Ingo Molnar , 1999, 2000 */ -#define APIC_DEFAULT_PHYS_BASE 0xfee00000 +#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 +#define APIC_DEFAULT_PHYS_BASE 0xfee00000 #define APIC_ID 0x20 diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b2886..fcd513bf284 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -482,11 +482,11 @@ static void __init construct_ioapic_table(int mpc_default_type) MP_bus_info(&bus); } - ioapic.type = MP_IOAPIC; - ioapic.apicid = 2; - ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.flags = MPC_APIC_USABLE; - ioapic.apicaddr = 0xFEC00000; + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; MP_ioapic_info(&ioapic); /* -- cgit v1.2.3-70-g09d2 From ffc438366c2660a6a811b94ba33229bf217f8254 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 24 Aug 2009 21:53:39 +0400 Subject: x86, ioapic: Get rid of needless check and simplify ioapic_setup_resources() alloc_bootmem() already panics on allocation failure. There is no need to check the result. Also there is a way to unbind global variable from its body and use it as a parameter which allow us to simplify ioapic_init_mappings as well -- "for" cycle already uses nr_ioapics as a conditional variable and there is no need to check if ioapic_setup_resources was returning NULL again. 
Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090824175551.493629148@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2999f3dd588..d836b4d347e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4053,7 +4053,7 @@ void __init setup_ioapic_dest(void) static struct resource *ioapic_resources; -static struct resource * __init ioapic_setup_resources(void) +static struct resource * __init ioapic_setup_resources(int nr_ioapics) { unsigned long n; struct resource *res; @@ -4069,15 +4069,13 @@ static struct resource * __init ioapic_setup_resources(void) mem = alloc_bootmem(n); res = (void *)mem; - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; + mem += sizeof(struct resource) * nr_ioapics; - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; } ioapic_resources = res; @@ -4091,7 +4089,7 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - ioapic_res = ioapic_setup_resources(); + ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].apicaddr; @@ -4120,11 +4118,9 @@ fake_ioapic_page: __fix_to_virt(idx), ioapic_phys); idx++; - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; } } -- cgit v1.2.3-70-g09d2 From 
5051fd69773d2d044734b78516317a04d3774871 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 24 Aug 2009 21:53:37 +0400 Subject: x86, e820: Guard against array overflowed in __e820_add_region() Better to be paranoid against unpredicted nr_map modifications. Signed-off-by: Cyrill Gorcunov LKML-Reference: <20090824175551.146070377@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7271fa33d79..2e5e0faa99b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -115,7 +115,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, { int x = e820x->nr_map; - if (x == ARRAY_SIZE(e820x->map)) { + if (x >= ARRAY_SIZE(e820x->map)) { printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); return; } -- cgit v1.2.3-70-g09d2 From 295594e9cf6ae2efd73371777aa8feba0f87f42f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 25 Aug 2009 13:44:44 -0700 Subject: x86: Fix vSMP boot crash 2.6.31-rc7 does not boot on vSMP systems: [ 8.501108] CPU31: Thermal monitoring enabled (TM1) [ 8.501127] CPU 31 MCA banks SHD:2 SHD:3 SHD:5 SHD:6 SHD:8 [ 8.650254] CPU31: Intel(R) Xeon(R) CPU E5540 @ 2.53GHz stepping 04 [ 8.710324] Brought up 32 CPUs [ 8.713916] Total of 32 processors activated (162314.96 BogoMIPS). [ 8.721489] ERROR: parent span is not a superset of domain->span [ 8.727686] ERROR: domain->groups does not contain CPU0 [ 8.733091] ERROR: groups don't span domain->span [ 8.737975] ERROR: domain->cpu_power not set [ 8.742416] Ravikiran Thirumalai bisected it to: | commit 2759c3287de27266e06f1f4e82cbd2d65f6a044c | x86: don't call read_apic_id if !cpu_has_apic The problem is that on vSMP systems the CPUID derived initial-APICIDs are overlapping - so we need to fall back on hard_smp_processor_id() which reads the local APIC. 
Both come from the hardware (influenced by firmware though) so it's a tough call which one to trust. Doing the quirk expresses the vSMP property properly and also does not affect other systems, so we go for this solution instead of a revert. Reported-and-Tested-by: Ravikiran Thirumalai Signed-off-by: Yinghai Lu Cc: Linus Torvalds Cc: Cyrill Gorcunov Cc: Shai Fultheim Cc: Suresh Siddha LKML-Reference: <4A944D3C.5030100@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/probe_64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index bc3e880f9b8..fcec2f1d34a 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -44,6 +44,11 @@ static struct apic *apic_probe[] __initdata = { NULL, }; +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ @@ -69,6 +74,11 @@ void __init default_setup_apic_routing(void) printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } + if (is_vsmp_box()) { + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; + } + /* * Now that apic routing model is selected, configure the * fault handling for intr remapping. -- cgit v1.2.3-70-g09d2 From 680b6cfd3cee30a7d997d49430fb73af84523853 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 26 Aug 2009 16:20:36 +0900 Subject: x86, mce: CE in last bank prevents panic by unknown MCE If MCE handler is called but none of mces_seen have machine check event which might signal the MCE (i.e. event higher than MCE_KEEP_SEVERITY), panic with "Machine check from unknown source" will be taken since the MCE is assumed to be signaled from external agent or so. Usually mces_seen never point MCE_KEEP_SEVERITY event such as CE. 
But it can happen because initial value of mces_seen is accidentally modified by mce_no_way_out() - in case if mce_no_way_out() run through all banks and the last bank has the CE, mces_seen points the CE and the "panic by unknown" will not be taken. This patch fixes this undesired behavior, and clarifies the logic. Signed-off-by: Hidetoshi Seto Cc: H. Peter Anvin Cc: Andi Kleen Cc: Jin Dongming LKML-Reference: <4A94E244.3020301@jp.fujitsu.com> Signed-off-by: Ingo Molnar Reported-by: Jin Dongming --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 54bd1b2fb4c..325559d1aa5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -612,7 +612,7 @@ out: * This way we prevent any potential data corruption in a unrecoverable case * and also makes sure always all CPU's errors are examined. * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer * space (not detected by any CPUs) In this case some external agent wants * us to shut down, so panic too. * @@ -665,7 +665,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. 
*/ - if (!m && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -889,11 +889,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - no_way_out = mce_no_way_out(&m, &msg); - final = &__get_cpu_var(mces_seen); *final = m; + no_way_out = mce_no_way_out(&m, &msg); + barrier(); /* -- cgit v1.2.3-70-g09d2 From d3a247bfb2c26f5b67367d58af7ad8c2efbbc6c1 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 26 Aug 2009 21:13:24 +0400 Subject: x86, apic: Slim down stack usage in early_init_lapic_mapping() As far as I see there is no external poking of mp_lapic_addr in this procedure which could lead to unpredicted changes and require a local storage unit for it. Let's use it directly. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu LKML-Reference: <20090826171324.GC4548@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3fc3a6c428d..159740decc4 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1562,8 +1562,6 @@ no_apic: #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) { - unsigned long phys_addr; - /* * If no local APIC can be found then go out * : it means there is no mpatable and MADT @@ -1571,11 +1569,9 @@ void __init early_init_lapic_mapping(void) if (!smp_found_config) return; - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); + APIC_BASE, mp_lapic_addr); /* * Fetch the APIC ID of the BSP in case we have a -- cgit v1.2.3-70-g09d2 From dd86dda24cc1dc70031a7d9250dc3c0c430a50e2 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon,
24 Aug 2009 17:40:14 -0400 Subject: tracing: Define NR_syscalls for x86 (32) Add an NR_syscalls #define for x86. This is used in the syscall events tracing code. Todo: make it dynamic like x86_64. NR_syscalls is the usual name used to determine the number of syscalls supported by the current arch. We want to unify the use of this number across archs that support the syscall tracing. This also prepares to move some of the arch code to core code in the syscall tracing area. Signed-off-by: Jason Baron Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Josh Stone Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Hendrik Brueckner Cc: Heiko Carstens LKML-Reference: <0f33c0f96d198fccc3ddd9ff7f5334ff5cb42706.1251146513.git.jbaron@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/unistd_32.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 732a3070615..8deaada61bc 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -345,6 +345,8 @@ #ifdef __KERNEL__ +#define NR_syscalls 337 + #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT -- cgit v1.2.3-70-g09d2 From a5a2f8e2acb991327952c45a13f5441fc09dffd6 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 26 Aug 2009 12:09:10 -0400 Subject: tracing: Define NR_syscalls for x86_64 Express the available number of syscalls in a standard way by defining NR_syscalls.
The common way to define it is to place its definition in asm/unistd.h. However, the number of syscalls is defined using __NR_syscall_max in x86-64 after building a dynamic header file "asm-offsets.h". The source file that generates this header, asm-offsets_64.c, includes unistd.h, so if we want to express NR_syscalls from __NR_syscall_max in unistd.h only after generating the dynamic header file, we need a guard. If unistd.h is included from asm-offsets_64.c, then we are generating asm-offsets.h which defines __NR_syscall_max. At this time, we don't want to (we can't) define NR_syscalls, so we do nothing. Otherwise we define NR_syscalls because we know asm-offsets.h has been generated. Signed-off-by: Jason Baron Acked-by: Steven Rostedt Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Josh Stone Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Hendrik Brueckner Cc: Heiko Carstens LKML-Reference: <20090826160910.GB2658@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/unistd_64.h | 6 ++++++ arch/x86/kernel/asm-offsets_64.c | 1 + 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 900e1617e67..b9f3c60de5f 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -688,6 +688,12 @@ __SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) #endif /* __NO_STUBS */ #ifdef __KERNEL__ + +#ifndef COMPILE_OFFSETS +#include +#define NR_syscalls (__NR_syscall_max + 1) +#endif + /* * "Conditional" syscalls * diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 898ecc47e12..4a6aeedcd96 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -3,6 +3,7 @@ * This code generates raw asm output which is post-processed to extract * and format the required data.
*/ +#define COMPILE_OFFSETS #include #include -- cgit v1.2.3-70-g09d2 From 57421dbbdc932d65f0e6a41ebb027a2bfe3d0669 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 24 Aug 2009 17:40:22 -0400 Subject: tracing: Convert event tracing code to use NR_syscalls Convert the syscalls event tracing code to use NR_syscalls, instead of FTRACE_SYSCALL_MAX. NR_syscalls is standard across most arches, and reduces code confusion/complexity. Signed-off-by: Jason Baron Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Josh Stone Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Hendrik Brueckner Cc: Heiko Carstens LKML-Reference: <9b4f1a84ecae57cc6599412772efa36f0d2b815b.1251146513.git.jbaron@redhat.com> Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/ftrace.c | 8 ++++---- kernel/trace/trace_syscalls.c | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3cff1214e17..9dbb527e165 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -494,7 +494,7 @@ static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) struct syscall_metadata *syscall_nr_to_meta(int nr) { - if (!syscalls_metadata || nr >= FTRACE_SYSCALL_MAX || nr < 0) + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; return syscalls_metadata[nr]; @@ -507,7 +507,7 @@ int syscall_name_to_nr(char *name) if (!syscalls_metadata) return -1; - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { if (syscalls_metadata[i]) { if (!strcmp(syscalls_metadata[i]->name, name)) return i; @@ -533,13 +533,13 @@ static int __init arch_init_ftrace_syscalls(void) unsigned long **psys_syscall_table = &sys_call_table; syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - FTRACE_SYSCALL_MAX, GFP_KERNEL); +
NR_syscalls, GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return -ENOMEM; } - for (i = 0; i < FTRACE_SYSCALL_MAX; i++) { + for (i = 0; i < NR_syscalls; i++) { meta = find_syscall_meta(psys_syscall_table[i]); syscalls_metadata[i] = meta; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb7f600cb02..4f5fae6fad9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -11,8 +11,8 @@ static DEFINE_MUTEX(syscall_trace_lock); static int sys_refcount_enter; static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, FTRACE_SYSCALL_MAX); -static DECLARE_BITMAP(enabled_exit_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags) @@ -289,7 +289,7 @@ int reg_event_syscall_enter(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -312,7 +312,7 @@ void unreg_event_syscall_enter(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -330,7 +330,7 @@ int reg_event_syscall_exit(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -353,7 +353,7 @@ void unreg_event_syscall_exit(void *ptr) name = (char *)ptr; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; @@ -373,8 +373,8 @@ struct trace_event event_syscall_exit = { #ifdef 
CONFIG_EVENT_PROFILE -static DECLARE_BITMAP(enabled_prof_enter_syscalls, FTRACE_SYSCALL_MAX); -static DECLARE_BITMAP(enabled_prof_exit_syscalls, FTRACE_SYSCALL_MAX); +static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); static int sys_prof_refcount_enter; static int sys_prof_refcount_exit; @@ -420,7 +420,7 @@ int reg_prof_syscall_enter(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -442,7 +442,7 @@ void unreg_prof_syscall_enter(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); @@ -481,7 +481,7 @@ int reg_prof_syscall_exit(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return -ENOSYS; mutex_lock(&syscall_trace_lock); @@ -503,7 +503,7 @@ void unreg_prof_syscall_exit(char *name) int num; num = syscall_name_to_nr(name); - if (num < 0 || num >= FTRACE_SYSCALL_MAX) + if (num < 0 || num >= NR_syscalls) return; mutex_lock(&syscall_trace_lock); -- cgit v1.2.3-70-g09d2 From 117226d15850387b55fd01675917ee4fcb9699e8 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 24 Aug 2009 17:40:26 -0400 Subject: tracing: Remove FTRACE_SYSCALL_MAX definitions Remove the FTRACE_SYSCALL_MAX definitions now that we have converted the syscall event tracing code to use NR_syscalls. Signed-off-by: Jason Baron Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Lai Jiangshan Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Mathieu Desnoyers Cc: Jiaying Zhang Cc: Martin Bligh Cc: Li Zefan Cc: Josh Stone Cc: Thomas Gleixner Cc: H. 
Peter Anwin Cc: Hendrik Brueckner Cc: Heiko Carstens LKML-Reference: Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/ftrace.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 71136545187..db24c2278be 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,13 +28,6 @@ #endif -/* FIXME: I don't want to stay hardcoded */ -#ifdef CONFIG_X86_64 -# define FTRACE_SYSCALL_MAX 299 -#else -# define FTRACE_SYSCALL_MAX 337 -#endif - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ -- cgit v1.2.3-70-g09d2 From 5fc517466dd3d0fc6d2a5180ca6792e60344d8be Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:32 -0700 Subject: x86, pat: Keep identity maps consistent with mmaps even when pat_disabled Make reserve_memtype internally take care of pat disabled case and fallback to default return values. Remove the specific pat_disabled checks in track_* routines. Change kernel_map_sync_memtype to sync identity map even when pat_disabled. This change ensures that, even for pat_disabled case, we take care of keeping identity map in sync. Before this patch, in pat disabled case, ioremap() keeps the identity maps in sync and other APIs like pci and /dev/mem mmap don't, which is not a very consistent behavior. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/pat.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb2806..d5af2792d2f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -339,6 +339,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_type) { if (req_type == -1) *new_type = _PAGE_CACHE_WB; + else if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -577,7 +579,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (!pat_enabled || base >= __pa(high_memory)) + if (base >= __pa(high_memory)) return 0; id_sz = (__pa(high_memory) < base + size) ? @@ -677,9 +679,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -715,9 +714,6 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -743,9 +739,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of -- cgit v1.2.3-70-g09d2 From 279e669b3fc0068cc3509e8e53036999e1e86588 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:33 -0700 Subject: x86, pat: ioremap to follow same PAT restrictions as other PAT users ioremap has this hard-coded check for new type and requested type. 
That check differs from other PAT users like /dev/mem mmap, remap_pfn_range in only one condition where requested type is UC_MINUS and new type is WC. Under that condition, ioremap fails. But other PAT interfaces succeed with a WC mapping. Change to make ioremap be in sync with other PAT APIs and use the same macro as others. Also changes the error print to KERN_ERR instead of pr_debug. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8a450930834..aeaea8c5b2f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -228,24 +228,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { - pr_debug("Warning: reserve_memtype returned %d\n", retval); + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } if (prot_val != new_prot_val) { - /* - * Do not fallback to certain memory types with certain - * requested type: - * - request is uc-, return cannot be write-back - * - request is uc-, return cannot be write-combine - * - request is write-combine, return cannot be write-back - */ - if ((prot_val == _PAGE_CACHE_UC_MINUS && - (new_prot_val == _PAGE_CACHE_WB || - new_prot_val == _PAGE_CACHE_WC)) || - (prot_val == _PAGE_CACHE_WC && - new_prot_val == _PAGE_CACHE_WB)) { - pr_debug( + if (!is_new_memtype_allowed(prot_val, new_prot_val)) { + printk(KERN_ERR "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), -- cgit v1.2.3-70-g09d2 From 9fd126bc742f74a95d2ba610247712ff05da02fe Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:34 -0700 Subject: x86, pat: New i/f for driver to request memtype 
for IO regions Add new routines to request memtype for IO regions. This will currently be a backend for io_mapping_* routines. But, it can also be made available to drivers directly in future, in case it is needed. reserve interface reserves the memory, makes sure we have a compatible memory type available and keeps the identity map in sync when needed. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pat.h | 5 +++++ arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 7af14e512f9..e2c1668dde7 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type); + +void io_free_memtype(resource_size_t start, resource_size_t end); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5af2792d2f..82d097ce309 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -498,6 +498,55 @@ int free_memtype(u64 start, u64 end) } +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. 
On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, end - start)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, end - start, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { -- cgit v1.2.3-70-g09d2 From 9e36fda0b359d2a6ae039c3d7e71a04502a77898 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:35 -0700 Subject: x86, pat: Add PAT reserve free to io_mapping* APIs io_mapping_* interfaces were added, mainly for graphics drivers. Make this interface go through the PAT reserve/free, instead of hardcoding WC mapping. This makes sure that there are no aliases due to unconditional WC setting. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/iomap.h | 9 ++++++--- arch/x86/mm/iomap_32.c | 27 +++++++++++++++++++++++++-- include/linux/io-mapping.h | 17 ++++++++++++----- 3 files changed, 43 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0e9fe1d9d97..f35eb45d657 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,13 +26,16 @@ #include #include -int -is_io_mapping_possible(resource_size_t base, unsigned long size); - void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); +int +iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +void +iomap_free(resource_size_t base, unsigned long size); + #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84ca121..84e236ce76b 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,7 +21,7 @@ #include #include -int is_io_mapping_possible(resource_size_t base, unsigned long size) +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ @@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) #endif return 1; } -EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); void 
*kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 0adb0f91568..97eb928b492 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,23 +49,30 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; - - if (!is_io_mapping_possible(base, size)) - return NULL; + pgprot_t prot; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); if (!iomap) - return NULL; + goto out_err; + + if (iomap_create_wc(base, size, &prot)) + goto out_free; iomap->base = base; iomap->size = size; - iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); + iomap->prot = prot; return iomap; + +out_free: + kfree(iomap); +out_err: + return NULL; } static inline void io_mapping_free(struct io_mapping *mapping) { + iomap_free(mapping->base, mapping->size); kfree(mapping); } -- cgit v1.2.3-70-g09d2 From 335ef896d4c6639849d79367f0fef9abc06d121b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:36 -0700 Subject: x86, pat: Add rbtree to do quick lookup in memtype tracking PAT memtype tracking uses a linear linked list to keep track of IO (non-RAM) regions and their memtypes. The code used a last_accessed pointer as a cache to speed up the lookup. As per discussions with H. Peter Anvin a while back, having a rbtree here will avoid bad performance in pathological cases where we may end up with huge linked list. This may not add any noticeable performance speedup in normal case as the number of entries in PAT memtype list tend to be ~20-30 range. The patch removes the "cached_entry" logic as with rbtree we have more generic way of speeding up the lookup. With this patch, we use rbtree to do the quick lookup. We still use linked list as the memtype range tracked can be of different sizes and can overlap in different ways. We also keep track of usage counts with linked list.
Example: Multiple ioremaps with different sizes uncached-minus @ 0xfffff00000-0xfffff04000 uncached-minus @ 0xfffff02000-0xfffff03000 And one userlevel mmap and the thread forks a new process uncached-minus @ 0xbf453000-0xbf454000 uncached-minus @ 0xbf453000-0xbf454000 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 106 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 82d097ce309..c90f2420f56 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags) * areas). All the aliases have the same cache attributes of course. * Zero attributes are represented as holes. * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. + * The data structure is a list that is also organized as an rbtree + * sorted on the start address of memtype range. * - * memtype_lock protects the whole list. + * memtype_lock protects both the linear list and rbtree. 
*/ struct memtype { @@ -160,11 +160,53 @@ struct memtype { u64 end; unsigned long type; struct list_head nd; + struct rb_node rb; }; +static struct rb_root memtype_rbroot = RB_ROOT; static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (data->start < start) { + last_lower = data; + node = node->rb_right; + } else if (data->start > start) { + node = node->rb_left; + } else + return data; + } + + /* Will return NULL if there is no entry with its start <= start */ + return last_lower; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *data) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct memtype *this = container_of(*new, struct memtype, rb); + + parent = *new; + if (data->start <= this->start) + new = &((*new)->rb_left); + else if (data->start > this->start) + new = &((*new)->rb_right); + } + + rb_link_node(&data->rb, parent, new); + rb_insert_color(&data->rb, root); +} + /* * Does intersection of PAT memory type and MTRR memory type and returns * the resulting memory type as PAT understands it. 
@@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) return -EBUSY; } -static struct memtype *cached_entry; -static u64 cached_start; - static int pat_pagerange_is_ram(unsigned long start, unsigned long end) { int ram_page = 0, not_rampage = 0; @@ -382,17 +421,19 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else + entry = memtype_rb_search(&memtype_rbroot, new->start); + if (likely(entry != NULL)) { + /* To work correctly with list_for_each_entry_continue */ + entry = list_entry(entry->nd.prev, struct memtype, nd); + } else { entry = list_entry(&memtype_list, struct memtype, nd); + } /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry_continue(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -400,8 +441,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; - cached_entry = list_entry(where, - struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -409,8 +448,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); /* * Move to right position in the linked @@ -438,13 +475,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } - cached_start = start; - if (where) list_add(&new->nd, where); else list_add_tail(&new->nd, &memtype_list); + memtype_rb_insert(&memtype_rbroot, new); + spin_unlock(&memtype_lock); dprintk("reserve_memtype added 
0x%Lx-0x%Lx, track %s, req %s, ret %s\n", @@ -456,7 +493,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *entry; + struct memtype *entry, *saved_entry; int err = -EINVAL; int is_range_ram; @@ -474,17 +511,46 @@ int free_memtype(u64 start, u64 end) return -EINVAL; spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, start); + if (unlikely(entry == NULL)) + goto unlock_ret; + + /* + * Saved entry points to an entry with start same or less than what + * we searched for. Now go through the list in both directions to look + * for the entry that matches with both start and end, with list stored + * in sorted start address + */ + saved_entry = entry; list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start > start) { + break; + } + } + + if (!err) + goto unlock_ret; + entry = saved_entry; + list_for_each_entry_reverse(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); list_del(&entry->nd); kfree(entry); err = 0; break; + } else if (entry->start < start) { + break; } } +unlock_ret: spin_unlock(&memtype_lock); if (err) { -- cgit v1.2.3-70-g09d2 From 46cf98cdaef5471926010b5bddf84c44ec177fdd Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:37 -0700 Subject: x86, pat: Generalize the use of page flag PG_uncached Only IA64 was using PG_uncached as of now. We now intend to use this bit in x86 as well, to keep track of memory type of those addresses that have page struct for them. So, generalize the use of that bit across ia64 and x86. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/ia64/Kconfig | 4 ++++ arch/x86/Kconfig | 4 ++++ include/linux/page-flags.h | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d..e6246119932 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -112,6 +112,10 @@ config IA64_UNCACHED_ALLOCATOR bool select GENERIC_ALLOCATOR +config ARCH_USES_PG_UNCACHED + def_bool y + depends on IA64_UNCACHED_ALLOCATOR + config AUDIT_ARCH bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f7220590..8e159538219 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1414,6 +1414,10 @@ config X86_PAT If unsure, say Y. +config ARCH_USES_PG_UNCACHED + def_bool y + depends on X86_PAT + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e2e5ce54359..2b87acfc5f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -99,7 +99,7 @@ enum pageflags { #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ #endif __NR_PAGEFLAGS, @@ -257,7 +257,7 @@ PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached) #else PAGEFLAG_FALSE(Uncached) -- cgit v1.2.3-70-g09d2 From f58417409603d62f2eb23db4d2cf6853d84a1698 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:38 -0700 Subject: x86, pat: Use page flags to track memtypes of RAM pages Change reserve_ram_pages_type and free_ram_pages_type to use 2 page flags to track UC_MINUS, WC, WB and default types. 
Previous RAM tracking just tracked WB or NonWB, which was not complete and did not allow tracking of RAM fully and there was no way to get the actual type reserved by looking at the page flags. We use the memtype_lock spinlock for atomicity in dealing with memtype tracking in struct page. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cacheflush.h | 54 ++++++++++++++++++++++- arch/x86/mm/pat.c | 91 +++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e55dfc1ad45..b54f6afe7ec 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma, memcpy(dst, src, len); } -#define PG_non_WB PG_arch_1 -PAGEFLAG(NonWB, non_WB) +#define PG_WC PG_arch_1 +PAGEFLAG(WC, WC) + +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and + * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_UC here. + * + * Caller must hold memtype_lock for atomicity. 
+ */ +static inline unsigned long get_page_memtype(struct page *pg) +{ + if (!PageUncached(pg) && !PageWC(pg)) + return -1; + else if (!PageUncached(pg) && PageWC(pg)) + return _PAGE_CACHE_WC; + else if (PageUncached(pg) && !PageWC(pg)) + return _PAGE_CACHE_UC_MINUS; + else + return _PAGE_CACHE_WB; +} + +static inline void set_page_memtype(struct page *pg, unsigned long memtype) +{ + switch (memtype) { + case _PAGE_CACHE_WC: + ClearPageUncached(pg); + SetPageWC(pg); + break; + case _PAGE_CACHE_UC_MINUS: + SetPageUncached(pg); + ClearPageWC(pg); + break; + case _PAGE_CACHE_WB: + SetPageUncached(pg); + SetPageWC(pg); + break; + default: + case -1: + ClearPageUncached(pg); + ClearPageWC(pg); + break; + } +} +#else +static inline unsigned long get_page_memtype(struct page *pg) { return -1; } +static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } +#endif /* * The set_memory_* API can be used to change various attributes of a virtual diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c90f2420f56..1a9d0f07593 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -288,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end) } /* - * For RAM pages, mark the pages as non WB memory type using - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or - * set_memory_wc() on a RAM page at a time before marking it as WB again. - * This is ok, because only one driver will be owning the page and - * doing set_memory_*() calls. + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range * - * For now, we use PageNonWB to track that the RAM page is being mapped - * as non WB. In future, we will have to use one more flag - * (or some other mechanism in page_struct) to distinguish between - * UC and WC mapping. 
+ * Caller must hold memtype_lock for atomicity. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { - page = pfn_to_page(pfn); - if (page_mapped(page) || PageNonWB(page)) - goto out; + unsigned long type; - SetPageNonWB(page); + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } } - return 0; -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + if (new_type) + *new_type = req_type; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - ClearPageNonWB(page); + set_page_memtype(page, req_type); } - - return -EINVAL; + return 0; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - if (page_mapped(page) || !PageNonWB(page)) - goto out; - - ClearPageNonWB(page); + set_page_memtype(page, -1); } return 0; - -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { - page = pfn_to_page(pfn); - SetPageNonWB(page); - } - return -EINVAL; } /* @@ -405,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = reserve_ram_pages_type(start, end, 
req_type, new_type); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -505,10 +508,16 @@ int free_memtype(u64 start, u64 end) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = free_ram_pages_type(start, end); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } spin_lock(&memtype_lock); -- cgit v1.2.3-70-g09d2 From 637b86e75f4c255a4446bc0b67ce9d914b9d2d42 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:39 -0700 Subject: x86, pat: Add lookup_memtype to get the current memtype of a paddr Add a new routine lookup_memtype() to get the current memtype based on the PAT reserves and frees. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/pat.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1a9d0f07593..71aa6f7246c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -573,6 +573,51 @@ unlock_ret: } +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + spin_lock(&memtype_lock); + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + spin_unlock(&memtype_lock); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + /** * io_reserve_memtype - Request a memory type mapping for a region of memory * @start: start (physical address) of the region -- cgit v1.2.3-70-g09d2 From 1087637616dd5e96d834164ea462aed6159d039b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:40 -0700 Subject: x86, pat: Lookup the protection from memtype list on vm_insert_pfn() Lookup the reserved memtype during vm_insert_pfn and use that memtype for the new mapping. This takes care of handling of vm_insert_pfn() interface in track_pfn_vma*/untrack_pfn_vma.
Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 71aa6f7246c..b629f75f73d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -848,11 +848,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the @@ -880,20 +875,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + unsigned long flags; resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + return 0; } @@ -908,11 +907,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. 
- */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; -- cgit v1.2.3-70-g09d2 From d886c73cd4cf02a71e1650cbcb6176799d78aac1 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:41 -0700 Subject: x86, pat: Sanity check remap_pfn_range for RAM region Add sanity check for remap_pfn_range of RAM regions using lookup_memtype(). Previously, we did not have any way to get the type of RAM memory regions as they were tracked using a single bit in page_struct (WB, nonWB). Now we can get the actual type from page struct (WB, WC, UC_MINUS) and make sure the requester gets that type. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b629f75f73d..a6cace0694a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -783,11 +783,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. Maintain the current - * behavior with RAM pages by returning success. + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range.
*/ - if (is_ram != 0) + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; + } ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) -- cgit v1.2.3-70-g09d2 From 57844a8f8e29802f37ad9a0f94eb11d6ae358603 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:48:38 +0200 Subject: x86: Add x86_init infrastructure The upcoming Moorestown support brings the embedded world to x86. The setup code of x86 has already a couple of hooks which are either x86_quirks or paravirt ops. Some of those setup hooks are pretty convoluted like the timer setup and the tsc calibration code. But there are other places which could do with a cleanup. Instead of having inline functions/macros which are modified at compile time I decided to introduce x86_init ops which are unconditional in the code and make it clear that they can be changed either during compile time or in the early boot process. The function pointers are initialized by default functions which can be noops so that the pointer can be called unconditionally in the most cases. This also allows us to remove 32bit/64bit, paravirt and other #ifdeffery. paravirt guests are just a hardware platform in the setup code, so we should treat them as such and not hide all behind multiple layers of indirection and compile time dependencies. It's more obvious that x86_init.timers.timer_init() is a function pointer than the late_time_init = choose_time_init() obscurity. 
It's also way simpler to grep for x86_init.timers.timer_init and find all the places which modify that function pointer instead of analyzing weak functions, macros and paravirt indirections. Note. This is not a general paravirt_ops replacement. It just will move setup related hooks which are potentially useful for other platform setup purposes as well out of the paravirt domain. Add the base infrastructure without any functionality. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 2 ++ arch/x86/include/asm/x86_init.h | 15 +++++++++++++++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/x86_init.c | 17 +++++++++++++++++ 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/x86_init.h create mode 100644 arch/x86/kernel/x86_init.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 4093d1ed6db..741e2956f3c 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -7,6 +7,8 @@ #ifndef __ASSEMBLY__ +#include + /* * Any setup quirks to be performed? 
*/ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h new file mode 100644 index 00000000000..14d11071675 --- /dev/null +++ b/arch/x86/include/asm/x86_init.h @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_PLATFORM_H +#define _ASM_X86_PLATFORM_H + +/** + * struct x86_init_ops - functions for platform specific setup + * + */ +struct x86_init_ops { +}; + +extern struct x86_init_ops x86_init; + +extern void x86_init_noop(void); + +#endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7..313ed6fca9b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,7 +32,7 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o -obj-y += setup.o i8259.o irqinit.o +obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..82d510c9c99 --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2009 Thomas Gleixner + * + * For licencing details see kernel-base/COPYING + */ +#include + +#include + +void __cpuinit x86_init_noop(void) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. + */ +struct __initdata x86_init_ops x86_init = { +}; -- cgit v1.2.3-70-g09d2 From f7cf5a5b8c0e59eac8d30b62271cb0fa52e53ebc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:43:56 +0200 Subject: x86: Add probe_roms to x86_init probe_roms is only used on 32bit. Add it to the x86_init ops and remove the #ifdefs. Default initializer is x86_init_noop() which is overridden in the 32bit boot code. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 10 ++++++++++ arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/setup.c | 4 +--- arch/x86/kernel/x86_init.c | 4 ++++ 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 14d11071675..75e9e68d635 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,11 +1,21 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +/** + * struct x86_init_resources - platform specific resource related ops + * @probe_roms: probe BIOS roms + * + */ +struct x86_init_resources { + void (*probe_roms)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * */ struct x86_init_ops { + struct x86_init_resources resources; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3f8579f8d42..4049353152c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,6 +29,9 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + reserve_ebda_region(); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef..5796eb158d4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -835,9 +835,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor(&boot_cpu_data); -#ifdef CONFIG_X86_32 - probe_roms(); -#endif + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ insert_resource(&iomem_resource, &code_resource); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 82d510c9c99..88883f8006c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,4 +14,8 @@ void __cpuinit x86_init_noop(void) { } * for standard PC hardware. 
*/ struct __initdata x86_init_ops x86_init = { + + .resources = { + .probe_roms = x86_init_noop, + }, }; -- cgit v1.2.3-70-g09d2 From 8fee697d990c54976c8dc167270633299e2515d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:55:50 +0200 Subject: x86: Add request_standard_resources to x86_init The 32bit and the 64bit code are slightly different in the reservation of standard resources. Also the upcoming Moorestown support needs its own version of that. Add it to x86_init_ops and initialize it with the 64bit default. 32bit overrides it in early boot. Now moorestown can add its own override w/o sprinkling the code with more #ifdefs Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 3 +++ arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/head32.c | 1 + arch/x86/kernel/setup.c | 28 ++++++++++++++++------------ arch/x86/kernel/x86_init.c | 3 ++- 5 files changed, 25 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 741e2956f3c..19769ac6061 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -88,6 +88,9 @@ extern unsigned long saved_video_mode; #define paravirt_post_allocator_init() do {} while (0) #endif +extern void reserve_standard_io_resources(void); +extern void i386_reserve_resources(void); + #ifndef _SETUP /* diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 75e9e68d635..d0d9be25ed9 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,10 +4,13 @@ /** * struct x86_init_resources - platform specific resource related ops * @probe_roms: probe BIOS roms + * @reserve_resources: reserve the standard resources for the + * platform * */ struct x86_init_resources { void (*probe_roms)(void); + void (*reserve_resources)(void); }; /** diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 4049353152c..d91c37c0206 100644 ---
a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ void __init i386_start_kernel(void) #endif /* Initilize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; reserve_ebda_region(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5796eb158d4..c2a8090e831 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -171,13 +171,6 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; /* common cpu data for all cpus */ @@ -605,7 +598,7 @@ static struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -static void __init reserve_standard_io_resources(void) +void __init reserve_standard_io_resources(void) { int i; @@ -1013,10 +1006,7 @@ void __init setup_arch(char **cmdline_p) e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); -#ifdef CONFIG_X86_32 - request_resource(&iomem_resource, &video_ram_resource); -#endif - reserve_standard_io_resources(); + x86_init.resources.reserve_resources(); e820_setup_gap(); @@ -1102,4 +1092,18 @@ void __init x86_quirk_time_init(void) irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +void __init i386_reserve_resources(void) +{ + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); +} + #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 88883f8006c..68c093b67ec 100644 --- 
a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,7 +5,7 @@ */ #include -#include +#include void __cpuinit x86_init_noop(void) { } @@ -17,5 +17,6 @@ struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, + .reserve_resources = reserve_standard_io_resources, }, }; -- cgit v1.2.3-70-g09d2 From 816c25e7d4fb6fd40022a376e8b7f45b1edf5a89 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 14:36:27 +0200 Subject: x86: Add reserve_ebda_region to x86_init_ops reserve_ebda_region needs to be called before start_kernel. Moorestown needs to override it. Make it a x86_init_ops function and initialize it with the default reserve_ebda_region. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/head32.c | 4 ++-- arch/x86/kernel/head64.c | 3 +-- arch/x86/kernel/x86_init.c | 2 ++ 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index d0d9be25ed9..8a971cb3dd3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -6,11 +6,13 @@ * @probe_roms: probe BIOS roms * @reserve_resources: reserve the standard resources for the * platform + * @reserve_ebda_region: reserve the extended bios data area * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); + void (*reserve_ebda_region)(void); }; /** diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d91c37c0206..921a23b6c14 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include void __init i386_start_kernel(void) @@ -33,7 +33,7 @@ void __init i386_start_kernel(void) x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; - reserve_ebda_region(); + x86_init.resources.reserve_ebda_region(); /* * At this point everything
still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 70eaa852c73..cead8149c3d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -23,7 +23,6 @@ #include #include #include -#include #include static void __init zap_identity_mappings(void) @@ -112,7 +111,7 @@ void __init x86_64_start_reservations(char *real_mode_data) } #endif - reserve_ebda_region(); + x86_init.resources.reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 68c093b67ec..1fff49a6858 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,6 +5,7 @@ */ #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -18,5 +19,6 @@ struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, + .reserve_ebda_region = reserve_ebda_region, }, }; -- cgit v1.2.3-70-g09d2 From 6b18ae3e2ff62daa9f181401759161dd8de0aadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:19:54 +0200 Subject: x86: Move memory_setup to x86_init_ops memory_setup is overridden by x86_quirks and by paravirts with weak functions and quirks. Unify the whole mess and make it an unconditional x86_init_ops function which defaults to the standard function and can be overridden by the early platform code. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/e820.h | 2 -- arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/e820.c | 19 +------------------ arch/x86/kernel/paravirt.c | 6 ------ arch/x86/kernel/visws_quirks.c | 3 ++- arch/x86/kernel/x86_init.c | 2 ++ arch/x86/lguest/boot.c | 3 ++- arch/x86/xen/enlighten.c | 3 ++- 11 files changed, 11 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 7ecba4d8508..40b4e614fe7 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -126,8 +126,6 @@ extern void e820_reserve_resources(void); extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -extern char *machine_specific_memory_setup(void); -extern char *memory_setup(void); #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 2b3371bae29..6d668968b6b 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -81,7 +81,6 @@ struct pv_init_ops { /* Basic arch-specific setup */ void (*arch_setup)(void); - char *(*memory_setup)(void); void (*post_allocator_init)(void); /* Print a banner to identify the environment */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 19769ac6061..9cba9d6ca88 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -22,7 +22,6 @@ struct x86_quirks { int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); - char * (*arch_memory_setup)(void); int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 
8a971cb3dd3..6c084f2a6c3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -7,12 +7,14 @@ * @reserve_resources: reserve the standard resources for the * platform * @reserve_ebda_region: reserve the extended bios data area + * @memory_setup: platform specific memory setup * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); void (*reserve_ebda_region)(void); + char *(*memory_setup)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ca96e68f0d2..403c062f69e 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -260,7 +260,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, .arch_pre_intr_init = NULL, - .arch_memory_setup = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, .mach_get_smp_config = NULL, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5cb5725b2ba..0d804b907e8 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void) return who; } -char *__init __attribute__((weak)) machine_specific_memory_setup(void) -{ - if (x86_quirks->arch_memory_setup) { - char *who = x86_quirks->arch_memory_setup(); - - if (who) - return who; - } - return default_machine_specific_memory_setup(); -} - -/* Overridden in paravirt.c if CONFIG_PARAVIRT */ -char * __init __attribute__((weak)) memory_setup(void) -{ - return machine_specific_memory_setup(); -} - void __init setup_memory_map(void) { char *who; - who = memory_setup(); + who = x86_init.resources.memory_setup(); memcpy(&e820_saved, &e820, sizeof(struct e820map)); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b951d7..532c9a2626c 100644 --- a/arch/x86/kernel/paravirt.c +++ 
b/arch/x86/kernel/paravirt.c @@ -60,11 +60,6 @@ static void __init default_banner(void) pv_info.name); } -char *memory_setup(void) -{ - return pv_init_ops.memory_setup(); -} - /* Simple instruction patching code. */ #define DEF_NATIVE(ops, name, code) \ extern const char start_##ops##_##name[], end_##ops##_##name[]; \ @@ -322,7 +317,6 @@ struct pv_init_ops pv_init_ops = { .patch = native_patch, .banner = default_banner, .arch_setup = paravirt_nop, - .memory_setup = machine_specific_memory_setup, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31ffc24eec4..97c670df1ae 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -239,7 +239,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, .arch_pre_intr_init = visws_pre_intr_init, - .arch_memory_setup = visws_memory_setup, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, .mach_get_smp_config = visws_get_smp_config, @@ -263,6 +262,8 @@ void __init visws_early_detect(void) */ x86_quirks = &visws_x86_quirks; + x86_init.resources.memory_setup = visws_memory_setup; + /* * Install reboot quirks: */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1fff49a6858..1965bff3489 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -7,6 +7,7 @@ #include #include +#include void __cpuinit x86_init_noop(void) { } @@ -20,5 +21,6 @@ struct __initdata x86_init_ops x86_init = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, .reserve_ebda_region = reserve_ebda_region, + .memory_setup = default_machine_specific_memory_setup, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index d677fa9ca65..11445c176de 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1270,7 +1270,6 @@ __init void lguest_init(void) pv_irq_ops.safe_halt = 
lguest_safe_halt; /* Setup operations */ - pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; /* Intercepts of various CPU instructions */ @@ -1325,6 +1324,8 @@ __init void lguest_init(void) pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; + x86_init.resources.memory_setup = lguest_memory_setup; + /* * Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e90540a46a0..50b20c64f0b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,7 +841,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, .banner = xen_banner, - .memory_setup = xen_memory_setup, .arch_setup = xen_arch_setup, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,8 @@ asmlinkage void __init xen_start_kernel(void) pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + x86_init.resources.memory_setup = xen_memory_setup; + #ifdef CONFIG_X86_64 /* * Setup percpu state. We only need to do this for 64-bit -- cgit v1.2.3-70-g09d2 From f4848472cd99487e182b64fb2a5d0e4fedbe86ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:05:01 +0200 Subject: x86: Sanitize smp_record and move it to x86_init_ops The x86 quirkification introduced an extra ugly hackery with a variable pointer in the mpparse code. If the pointer is initialized then it is dereferenced and the variable set to 0 or incremented. Create a x86_init_ops function and let the affected numaq code hold the function. Default init is a setup noop. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 12 +++++++++++- arch/x86/kernel/apic/numaq_32.c | 19 ++++++++++++++++--- arch/x86/kernel/mpparse.c | 6 ++---- arch/x86/kernel/x86_init.c | 5 +++++ 5 files changed, 34 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9cba9d6ca88..bbf2dfd59b4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -25,7 +25,6 @@ struct x86_quirks { int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - int *mpc_record; int (*mpc_apic_id)(struct mpc_cpu *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 6c084f2a6c3..10b297b1881 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,14 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +/** + * struct x86_init_mpparse - platform specific mpparse ops + * @mpc_record: platform specific mpc record accounting + */ +struct x86_init_mpparse { + void (*mpc_record)(unsigned int mode); +}; + /** * struct x86_init_resources - platform specific resource related ops * @probe_roms: probe BIOS roms @@ -22,11 +30,13 @@ struct x86_init_resources { * */ struct x86_init_ops { - struct x86_init_resources resources; + struct x86_init_resources resources; + struct x86_init_mpparse mpparse; }; extern struct x86_init_ops x86_init; extern void x86_init_noop(void); +extern void x86_init_uint_noop(unsigned int unused); #endif diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 403c062f69e..b5f0b1dc7dd 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -66,7 +66,6 @@ struct mpc_trans { unsigned short trans_reserved; }; -/* x86_quirks member */ static int 
mpc_record; static struct mpc_trans *translation_table[MAX_MPC_ENTRY]; @@ -177,6 +176,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m) quad_local_to_mp_bus_id[quad][local] = m->busid; } +/* + * Called from mpparse code. + * mode = 0: prescan + * mode = 1: one mpc entry scanned + */ +static void numaq_mpc_record(unsigned int mode) +{ + if (!mode) + mpc_record = 0; + else + mpc_record++; +} + static void __init MP_translation_info(struct mpc_trans *m) { printk(KERN_INFO @@ -264,7 +276,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_record = &mpc_record, .mpc_apic_id = mpc_apic_id, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, @@ -285,8 +296,10 @@ static __init void early_check_numaq(void) if (smp_found_config) early_get_smp_config(); - if (found_numaq) + if (found_numaq) { x86_quirks = &numaq_x86_quirks; + x86_init.mpparse.mpc_record = numaq_mpc_record; + } } int __init get_memcfg_numaq(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 651c93b2886..b2179fdf0ff 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -320,8 +320,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) /* * Now process the configuration blocks. 
*/ - if (x86_quirks->mpc_record) - *x86_quirks->mpc_record = 0; + x86_init.mpparse.mpc_record(0); while (count < mpc->length) { switch (*mpt) { @@ -353,8 +352,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) count = mpc->length; break; } - if (x86_quirks->mpc_record) - (*x86_quirks->mpc_record)++; + x86_init.mpparse.mpc_record(1); } #ifdef CONFIG_X86_BIGSMP diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 1965bff3489..83bd5db376b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -10,6 +10,7 @@ #include void __cpuinit x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } /* * The platform setup functions are preset with the default functions @@ -23,4 +24,8 @@ struct __initdata x86_init_ops x86_init = { .reserve_ebda_region = reserve_ebda_region, .memory_setup = default_machine_specific_memory_setup, }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + }, }; -- cgit v1.2.3-70-g09d2 From de93410310952fb7b705f784ef22493c8362dbe8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:27:29 +0200 Subject: x86: Move ioapic_ids_setup to x86_init_ops 32bit and also the numaq code have special requirements on the ioapic_id setup. 
Convert it to a x86_init_ops function and get rid of the quirks and #ifdefs Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 3 ++- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 11 ++++------- arch/x86/kernel/apic/numaq_32.c | 8 +------- arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/x86_init.c | 1 + 7 files changed, 13 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 330ee807f89..2b8aeb89933 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -177,12 +177,13 @@ extern int setup_ioapic_entry(int apic, int irq, int polarity, int vector, int pin); extern void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e); +extern void setup_ioapic_ids_from_mpc(void); #else /* !CONFIG_X86_IO_APIC */ #define io_apic_assign_pci_irqs 0 +#define setup_ioapic_ids_from_mpc x86_init_noop static const int timer_through_8259 = 0; static inline void ioapic_init_mappings(void) { } static inline void ioapic_insert_resources(void) { } - static inline void probe_nr_irqs_gsi(void) { } #endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index bbf2dfd59b4..cc8b4b0550e 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -30,7 +30,6 @@ struct x86_quirks { void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, unsigned short oemsize); - int (*setup_ioapic_ids)(void); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 10b297b1881..65985730b37 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -4,9 +4,11 @@ /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting + * @setup_ioapic_ids: platform specific ioapic id 
override */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); + void (*setup_ioapic_ids)(void); }; /** diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d2ed6c5ddc8..5f4687187ce 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2014,7 +2014,7 @@ void disable_IO_APIC(void) * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 */ -static void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -2023,9 +2023,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + if (acpi_ioapic) return; - /* * Don't check I/O APIC IDs for xAPIC systems. They have * no meaning without the serial APIC bus. @@ -3061,10 +3060,8 @@ void __init setup_IO_APIC(void) /* * Set up IO-APIC IRQ routing. */ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif + x86_init.mpparse.setup_ioapic_ids(); + sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index b5f0b1dc7dd..f3717659265 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -262,12 +262,6 @@ static void __init } } -static int __init numaq_setup_ioapic_ids(void) -{ - /* so can skip it */ - return 1; -} - static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -280,7 +274,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, - .setup_ioapic_ids = numaq_setup_ioapic_ids, }; static __init void early_check_numaq(void) @@ -299,6 +292,7 @@ static __init void early_check_numaq(void) if (found_numaq) { x86_quirks = 
&numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; + x86_init.mpparse.setup_ioapic_ids = x86_init_noop; } } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 921a23b6c14..a21398fac4f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include void __init i386_start_kernel(void) { @@ -32,6 +34,7 @@ void __init i386_start_kernel(void) /* Initilize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; x86_init.resources.reserve_ebda_region(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 83bd5db376b..f4a32b3ab02 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -27,5 +27,6 @@ struct __initdata x86_init_ops x86_init = { .mpparse = { .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, }, }; -- cgit v1.2.3-70-g09d2 From fd6c6661492226bb82f422157c535ac573cbecbd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:41:58 +0200 Subject: x86: Move mpc_apic_id to x86_init_ops The mpc_apic_id setup is handled by an x86_quirk. Make it an x86_init_ops function with a default implementation.
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 ++ arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 10 ++++++---- arch/x86/kernel/x86_init.c | 2 ++ 6 files changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e2a1bb6d71e..03c6a92bfd4 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -62,10 +62,12 @@ extern void get_smp_config(void); extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; +extern int default_mpc_apic_id(struct mpc_cpu *m); #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 +#define default_mpc_apic_id NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index cc8b4b0550e..7c7f44f3e4d 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -12,7 +12,6 @@ /* * Any setup quirks to be performed? 
*/ -struct mpc_cpu; struct mpc_bus; struct mpc_oemtable; @@ -25,7 +24,6 @@ struct x86_quirks { int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - int (*mpc_apic_id)(struct mpc_cpu *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 65985730b37..f2be2a78018 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,14 +1,18 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +struct mpc_cpu; + /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting * @setup_ioapic_ids: platform specific ioapic id override + * @mpc_apic_id: platform specific mpc apic id assignment */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); + int (*mpc_apic_id)(struct mpc_cpu *m); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f3717659265..222413f7e79 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -270,7 +270,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_apic_id = mpc_apic_id, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; + x86_init.mpparse.mpc_apic_id = mpc_apic_id; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index b2179fdf0ff..04560860a72 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -45,6 +45,11 @@ static int 
__init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} + static void __init MP_processor_info(struct mpc_cpu *m) { int apicid; @@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) return; } - if (x86_quirks->mpc_apic_id) - apicid = x86_quirks->mpc_apic_id(m); - else - apicid = m->apicid; + apicid = x86_init.mpparse.mpc_apic_id(m); if (m->cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f4a32b3ab02..08749f2612f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -28,5 +29,6 @@ struct __initdata x86_init_ops x86_init = { .mpparse = { .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = default_mpc_apic_id, }, }; -- cgit v1.2.3-70-g09d2 From 72302142e165313ee58af81bd76708c12b58d7ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:18:32 +0200 Subject: x86: Move smp_read_mpc_oem to x86_init_ops. Move smp_read_mpc_oem from quirks to x86_init. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 ++ arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/apic/numaq_32.c | 6 +++--- arch/x86/kernel/mpparse.c | 8 ++++---- arch/x86/kernel/x86_init.c | 1 + 6 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 03c6a92bfd4..5de8e92be2d 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -63,11 +63,13 @@ extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); +extern void default_smp_read_mpc_oem(struct mpc_table *mpc); #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL +#define default_smp_read_mpc_oem NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 7c7f44f3e4d..adb5d44d990 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -13,7 +13,6 @@ * Any setup quirks to be performed? 
*/ struct mpc_bus; -struct mpc_oemtable; struct x86_quirks { int (*arch_pre_time_init)(void); @@ -26,8 +25,6 @@ struct x86_quirks { void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_bus *m); - void (*smp_read_mpc_oem)(struct mpc_oemtable *oemtable, - unsigned short oemsize); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f2be2a78018..fc0eef2f5fd 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,17 +2,20 @@ #define _ASM_X86_PLATFORM_H struct mpc_cpu; +struct mpc_table; /** * struct x86_init_mpparse - platform specific mpparse ops * @mpc_record: platform specific mpc record accounting * @setup_ioapic_ids: platform specific ioapic id override * @mpc_apic_id: platform specific mpc apic id assignment + * @smp_read_mpc_oem: platform specific oem mpc table setup */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); int (*mpc_apic_id)(struct mpc_cpu *m); + void (*smp_read_mpc_oem)(struct mpc_table *mpc); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 222413f7e79..1bd3b0ed240 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -218,9 +218,9 @@ static int __init mpf_checksum(unsigned char *mp, int len) /* * Read/parse the MPC oem tables */ -static void __init - smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize) +static void __init smp_read_mpc_oem(struct mpc_table *mpc) { + struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; int count = sizeof(*oemtable); /* the header size */ unsigned char *oemptr = ((unsigned char *)oemtable) + count; @@ -272,7 +272,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mach_find_smp_config = NULL, .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, - .smp_read_mpc_oem = smp_read_mpc_oem, }; static 
__init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_record = numaq_mpc_record; x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; + x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 04560860a72..45abdf63edc 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -293,6 +293,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) 1, mpc, mpc->length, 1); } +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -314,10 +316,8 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; - if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) { - struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr; - x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize); - } + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); /* * Now process the configuration blocks. 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 08749f2612f..fb5d93c077d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -30,5 +30,6 @@ struct __initdata x86_init_ops x86_init = { .mpc_record = x86_init_uint_noop, .setup_ioapic_ids = x86_init_noop, .mpc_apic_id = default_mpc_apic_id, + .smp_read_mpc_oem = default_smp_read_mpc_oem, }, }; -- cgit v1.2.3-70-g09d2 From 52fdb5684660f9fd7129f7bbbe279a02893bacb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:45:33 +0200 Subject: x86: Move mpc_oem_pci_bus to x86_init_ops Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 3 +++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 4 ++-- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index adb5d44d990..fd2267baba1 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -24,7 +24,6 @@ struct x86_quirks { int (*mach_find_smp_config)(unsigned int reserve); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); - void (*mpc_oem_pci_bus)(struct mpc_bus *m); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index fc0eef2f5fd..404e2d2b06d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +struct mpc_bus; struct mpc_cpu; struct mpc_table; @@ -10,12 +11,14 @@ struct mpc_table; * @setup_ioapic_ids: platform specific ioapic id override * @mpc_apic_id: platform specific mpc apic id assignment * @smp_read_mpc_oem: platform specific oem mpc table setup + * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); void (*setup_ioapic_ids)(void); int (*mpc_apic_id)(struct 
mpc_cpu *m); void (*smp_read_mpc_oem)(struct mpc_table *mpc); + void (*mpc_oem_pci_bus)(struct mpc_bus *m); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 1bd3b0ed240..feebe8eed7d 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -271,7 +271,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, .mpc_oem_bus_info = mpc_oem_bus_info, - .mpc_oem_pci_bus = mpc_oem_pci_bus, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; + x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 45abdf63edc..72e1140723c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -98,8 +98,8 @@ static void __init MP_bus_info(struct mpc_bus *m) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { - if (x86_quirks->mpc_oem_pci_bus) - x86_quirks->mpc_oem_pci_bus(m); + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); clear_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) -- cgit v1.2.3-70-g09d2 From 90e1c6969d8711edb888a00ec54c74370f125c8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 12:34:47 +0200 Subject: x86: Move oem_bus_info to x86_init_ops Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 6 ++++++ arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/mpparse.c | 14 ++++++++------ arch/x86/kernel/x86_init.c | 1 + 6 files changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git 
a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 5de8e92be2d..e3c579efde7 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -64,12 +64,18 @@ extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); extern void default_smp_read_mpc_oem(struct mpc_table *mpc); +# ifdef CONFIG_X86_IO_APIC +extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); +# else +# define default_mpc_oem_bus_info NULL +# endif #else static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL +#define default_mpc_oem_bus_info NULL #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index fd2267baba1..6121a8ac7b0 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -12,7 +12,6 @@ /* * Any setup quirks to be performed? 
*/ -struct mpc_bus; struct x86_quirks { int (*arch_pre_time_init)(void); @@ -22,8 +21,6 @@ struct x86_quirks { int (*arch_trap_init)(void); int (*mach_get_smp_config)(unsigned int early); int (*mach_find_smp_config)(unsigned int reserve); - - void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 404e2d2b06d..2833a873a90 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -12,6 +12,7 @@ struct mpc_table; * @mpc_apic_id: platform specific mpc apic id assignment * @smp_read_mpc_oem: platform specific oem mpc table setup * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) + * @mpc_oem_bus_info: platform specific mpc bus info */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); @@ -19,6 +20,7 @@ struct x86_init_mpparse { int (*mpc_apic_id)(struct mpc_cpu *m); void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); + void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index feebe8eed7d..700273dca68 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -270,7 +270,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_trap_init = NULL, .mach_get_smp_config = NULL, .mach_find_smp_config = NULL, - .mpc_oem_bus_info = mpc_oem_bus_info, }; static __init void early_check_numaq(void) @@ -293,6 +292,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; + x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 72e1140723c..a42f23f1dc7 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c 
@@ -72,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m) } #ifdef CONFIG_X86_IO_APIC -static void __init MP_bus_info(struct mpc_bus *m) +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) { - char str[7]; memcpy(str, m->bustype, 6); str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; - if (x86_quirks->mpc_oem_bus_info) - x86_quirks->mpc_oem_bus_info(m, str); - else - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); + x86_init.mpparse.mpc_oem_bus_info(m, str); #if MAX_MP_BUSSES < 256 if (m->busid >= MAX_MP_BUSSES) { diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index fb5d93c077d..27685edc546 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -31,5 +31,6 @@ struct __initdata x86_init_ops x86_init = { .setup_ioapic_ids = x86_init_noop, .mpc_apic_id = default_mpc_apic_id, .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, }, }; -- cgit v1.2.3-70-g09d2 From 7285dd7fd375763bfb8ab1ac9cf3f1206f503c16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 28 Aug 2009 20:25:24 +0200 Subject: clocksource: Resolve cpu hotplug dead lock with TSC unstable Martin Schwidefsky analyzed it: To register a clocksource the clocksource_mutex is acquired and if necessary timekeeping_notify is called to install the clocksource as the timekeeper clock. timekeeping_notify uses stop_machine which needs to take cpu_add_remove_lock mutex. Starting a new cpu is done with the cpu_add_remove_lock mutex held. native_cpu_up checks the tsc of the new cpu and if the tsc is no good clocksource_change_rating is called. Which needs the clocksource_mutex and the deadlock is complete. The solution is to replace the TSC via the clocksource watchdog mechanism. Mark the TSC as unstable and schedule the watchdog work so it gets removed in the watchdog thread context. 
Signed-off-by: Thomas Gleixner LKML-Reference: Cc: Martin Schwidefsky Cc: John Stultz --- arch/x86/kernel/tsc.c | 8 +++++--- include/linux/clocksource.h | 1 + kernel/time/clocksource.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 968425422c4..fc3672a303d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -767,12 +767,14 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - printk("Marking TSC unstable due to %s\n", reason); + printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) - clocksource_change_rating(&clocksource_tsc, 0); - else + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; clocksource_tsc.rating = 0; + } } } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9ea40ff26f0..83d2fbd81b9 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -277,6 +277,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_resume(void); extern struct clocksource * __init __weak clocksource_default_clock(void); +extern void clocksource_mark_unstable(struct clocksource *cs); #ifdef CONFIG_GENERIC_TIME_VSYSCALL extern void update_vsyscall(struct timespec *ts, struct clocksource *c); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0c86ad6e9f..a0af4ffcb6e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -149,15 +149,42 @@ static void clocksource_watchdog_work(struct work_struct *work) kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) +static void __clocksource_unstable(struct 
clocksource *cs) { - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; schedule_work(&watchdog_work); } +static void clocksource_unstable(struct clocksource *cs, int64_t delta) +{ + printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", + cs->name, delta); + __clocksource_unstable(cs); +} + +/** + * clocksource_mark_unstable - mark clocksource unstable via watchdog + * @cs: clocksource to be marked unstable + * + * This function is called instead of clocksource_change_rating from + * cpu hotplug code to avoid a deadlock between the clocksource mutex + * and the cpu hotplug mutex. It defers the update of the clocksource + * to the watchdog thread. + */ +void clocksource_mark_unstable(struct clocksource *cs) +{ + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) { + if (list_empty(&cs->wd_list)) + list_add(&cs->wd_list, &watchdog_list); + __clocksource_unstable(cs); + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; -- cgit v1.2.3-70-g09d2 From 47d25003cbd9e9030a95f7ccc4e70fec6aa7b844 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 28 Aug 2009 14:11:57 +0100 Subject: x86: Fix earlyprintk=dbgp for machines without NX Since parse_early_param() may (e.g. for earlyprintk=dbgp) involve calls to page table manipulation functions (here set_fixmap_nocache()), NX hardware support must be determined before calling that function (so that __supported_pte_mask gets properly set up). But the call after parse_early_param() cannot go away either, as it honors any disabling of the NX functionality specified on the command line.
( This will then just result in whatever mappings got established during parse_early_param() having the NX bit set despite it being disabled on the command line, but I think that's tolerable). Signed-off-by: Jan Beulich Cc: Yinghai Lu LKML-Reference: <4A97F3BD02000078000121B9@vpn.id2.novell.com> [ merged to x86/pat to resolve a conflict. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 02643cc3bf2..eb1f1e6e52b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -714,6 +714,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. + */ + check_efer(); +#endif + parse_early_param(); /* VMI may relocate the fixmap; do this before touching ioremap area */ -- cgit v1.2.3-70-g09d2 From 23386d63bbb3199cf247313ec088878d72debcfd Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Sat, 29 Aug 2009 18:27:18 +0200 Subject: x86: Detect stack protector for i386 builds on x86_64 Stack protector support was not detected when building with ARCH=i386 on x86_64 systems: arch/x86/Makefile:80: stack protector enabled but no compiler support The "-m32" argument needs to be passed to the detection script. 
Signed-off-by: Michal Schmidt Cc: Tejun Heo Cc: Jeremy Fitzhardinge Cc: Arjan van de Ven LKML-Reference: <20090829182718.10f566b1@leela> Signed-off-by: Ingo Molnar -- --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1b68659c41b..5e7db44d709 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -72,7 +72,7 @@ endif ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh - ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y) + ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) stackp-y := -fstack-protector stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all KBUILD_CFLAGS += $(stackp-y) -- cgit v1.2.3-70-g09d2 From b3f1b617f49447df6c3f5fac9c225aaea8b724ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 11:11:52 +0200 Subject: x86: Move get/find_smp_config to x86_init_ops Replace the quirk machinery with an x86_init_ops function which defaults to the standard implementation.
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 37 ++++++++++++++++++++++++++++++------- arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 2 -- arch/x86/kernel/mpparse.c | 33 ++------------------------------- arch/x86/kernel/setup.c | 2 -- arch/x86/kernel/visws_quirks.c | 14 ++++---------- arch/x86/kernel/x86_init.c | 2 ++ 8 files changed, 42 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index e3c579efde7..79c94500c0b 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -4,6 +4,7 @@ #include #include +#include extern int apic_version[MAX_APICS]; extern int pic_mode; @@ -41,9 +42,6 @@ extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; #endif /* CONFIG_X86_64 */ -extern void early_find_smp_config(void); -extern void early_get_smp_config(void); - #if defined(CONFIG_MCA) || defined(CONFIG_EISA) extern int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -52,14 +50,36 @@ extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; extern unsigned int max_physical_apicid; -extern int smp_found_config; extern int mpc_default_type; extern unsigned long mp_lapic_addr; -extern void get_smp_config(void); +#ifdef CONFIG_X86_LOCAL_APIC +extern int smp_found_config; +#else +# define smp_found_config 0 +#endif + +static inline void get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(0); +} + +static inline void early_get_smp_config(void) +{ + x86_init.mpparse.get_smp_config(1); +} + +static inline void find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(1); +} + +static inline void early_find_smp_config(void) +{ + x86_init.mpparse.find_smp_config(0); +} #ifdef CONFIG_X86_MPPARSE -extern void find_smp_config(void); extern void early_reserve_e820_mpc_new(void); extern int enable_update_mptable; extern int default_mpc_apic_id(struct mpc_cpu *m); 
@@ -69,13 +89,16 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str); # else # define default_mpc_oem_bus_info NULL # endif +extern void default_find_smp_config(unsigned int reserve); +extern void default_get_smp_config(unsigned int early); #else -static inline void find_smp_config(void) { } static inline void early_reserve_e820_mpc_new(void) { } #define enable_update_mptable 0 #define default_mpc_apic_id NULL #define default_smp_read_mpc_oem NULL #define default_mpc_oem_bus_info NULL +#define default_find_smp_config x86_init_uint_noop +#define default_get_smp_config x86_init_uint_noop #endif void __cpuinit generic_processor_info(int apicid, int version); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 6121a8ac7b0..345a2551af9 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -19,8 +19,6 @@ struct x86_quirks { int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); - int (*mach_get_smp_config)(unsigned int early); - int (*mach_find_smp_config)(unsigned int reserve); }; extern void x86_quirk_intr_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 2833a873a90..e0d4729c905 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -13,6 +13,8 @@ struct mpc_table; * @smp_read_mpc_oem: platform specific oem mpc table setup * @mpc_oem_pci_bus: platform specific pci bus setup (default NULL) * @mpc_oem_bus_info: platform specific mpc bus info + * @find_smp_config: find the smp configuration + * @get_smp_config: get the smp configuration */ struct x86_init_mpparse { void (*mpc_record)(unsigned int mode); @@ -21,6 +23,8 @@ struct x86_init_mpparse { void (*smp_read_mpc_oem)(struct mpc_table *mpc); void (*mpc_oem_pci_bus)(struct mpc_bus *m); void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); + void (*find_smp_config)(unsigned int reserve); + void (*get_smp_config)(unsigned int early); }; 
/** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 700273dca68..3dd5fd76534 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -268,8 +268,6 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_intr_init = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, - .mach_get_smp_config = NULL, - .mach_find_smp_config = NULL, }; static __init void early_check_numaq(void) diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index a42f23f1dc7..75357647b6e 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -610,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) /* * Scan the memory blocks for an SMP configuration block. */ -static void __init __get_smp_config(unsigned int early) +void __init default_get_smp_config(unsigned int early) { struct mpf_intel *mpf = mpf_found; @@ -627,11 +627,6 @@ static void __init __get_smp_config(unsigned int early) if (acpi_lapic && acpi_ioapic) return; - if (x86_quirks->mach_get_smp_config) { - if (x86_quirks->mach_get_smp_config(early)) - return; - } - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->specification); #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) @@ -672,16 +667,6 @@ static void __init __get_smp_config(unsigned int early) */ } -void __init early_get_smp_config(void) -{ - __get_smp_config(1); -} - -void __init get_smp_config(void) -{ - __get_smp_config(0); -} - static void __init smp_reserve_bootmem(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); @@ -747,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -static void __init __find_smp_config(unsigned int reserve) +void __init default_find_smp_config(unsigned int reserve) { unsigned int address; - if (x86_quirks->mach_find_smp_config) { - if (x86_quirks->mach_find_smp_config(reserve)) - return; - } /* * 
FIXME: Linux assumes you have 640K of base ram.. * this continues the error... @@ -789,16 +770,6 @@ static void __init __find_smp_config(unsigned int reserve) smp_scan_config(address, 0x400, reserve); } -void __init early_find_smp_config(void) -{ - __find_smp_config(0); -} - -void __init find_smp_config(void) -{ - __find_smp_config(1); -} - #ifdef CONFIG_X86_IO_APIC static u8 __initdata irq_used[MAX_IRQ_SOURCES]; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c2a8090e831..54043cb7ba6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -981,13 +981,11 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); -#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); -#endif prefill_possible_map(); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 97c670df1ae..31e828118f8 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -156,12 +156,8 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config(unsigned int early) +static void __init visws_get_smp_config(unsigned int early) { - /* - * Prevent MP-table parsing by the generic code: - */ - return 1; } /* @@ -208,7 +204,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) apic_version[m->apicid] = ver; } -static int __init visws_find_smp_config(unsigned int reserve) +static void __init visws_find_smp_config(unsigned int reserve) { struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -230,8 +226,6 @@ static int __init visws_find_smp_config(unsigned int reserve) MP_processor_info(mp++); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - return 1; } static int visws_trap_init(void); @@ -241,8 +235,6 @@ static struct x86_quirks visws_x86_quirks __initdata = { .arch_pre_intr_init = visws_pre_intr_init, 
.arch_intr_init = NULL, .arch_trap_init = visws_trap_init, - .mach_get_smp_config = visws_get_smp_config, - .mach_find_smp_config = visws_find_smp_config, }; void __init visws_early_detect(void) @@ -263,6 +255,8 @@ void __init visws_early_detect(void) x86_quirks = &visws_x86_quirks; x86_init.resources.memory_setup = visws_memory_setup; + x86_init.mpparse.get_smp_config = visws_get_smp_config; + x86_init.mpparse.find_smp_config = visws_find_smp_config; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 27685edc546..3488fb62ac0 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -32,5 +32,7 @@ struct __initdata x86_init_ops x86_init = { .mpc_apic_id = default_mpc_apic_id, .smp_read_mpc_oem = default_smp_read_mpc_oem, .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, }, }; -- cgit v1.2.3-70-g09d2 From d9112f43021554ded2ef2b9bea5f88ba4b52abe0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:41:38 +0200 Subject: x86: Move pre_intr_init to x86_init_ops Replace the quirk machinery with an x86_init_ops function which defaults to the standard implementation. This is also a preparatory patch for Moorestown support which needs to replace the default init_ISA_irqs as well.
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq.h | 2 ++ arch/x86/include/asm/setup.h | 1 - arch/x86/include/asm/x86_init.h | 10 ++++++++++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/irqinit.c | 24 ++---------------------- arch/x86/kernel/visws_quirks.c | 10 +++------- arch/x86/kernel/x86_init.c | 5 +++++ 7 files changed, 22 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index f38481bcd45..8fe2782a253 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -47,4 +47,6 @@ extern unsigned int do_IRQ(struct pt_regs *regs); extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); +extern void init_ISA_irqs(void); + #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 345a2551af9..66a319709d6 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,7 +16,6 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_pre_intr_init)(void); int (*arch_intr_init)(void); int (*arch_trap_init)(void); }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index e0d4729c905..65e3394c77f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -43,6 +43,15 @@ struct x86_init_resources { char *(*memory_setup)(void); }; +/** + * struct x86_init_irqs - platform specific interrupt setup + * @pre_vector_init: init code to run before interrupt vectors + * are set up. 
+ */ +struct x86_init_irqs { + void (*pre_vector_init)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -50,6 +59,7 @@ struct x86_init_resources { struct x86_init_ops { struct x86_init_resources resources; struct x86_init_mpparse mpparse; + struct x86_init_irqs irqs; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 3dd5fd76534..ec8b3113716 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_pre_intr_init = NULL, .arch_intr_init = NULL, .arch_trap_init = NULL, }; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 92b7703d3d5..acdf088c758 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -static void __init init_ISA_irqs(void) +void __init init_ISA_irqs(void) { int i; @@ -213,32 +213,12 @@ static void __init apic_intr_init(void) #endif } -/** - * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors - * - * Description: - * Perform any necessary interrupt initialisation prior to setting up - * the "ordinary" interrupt call gates. For legacy reasons, the ISA - * interrupts should be initialised here if the machine emulates a PC - * in any way. 
- **/ -static void __init x86_quirk_pre_intr_init(void) -{ -#ifdef CONFIG_X86_32 - if (x86_quirks->arch_pre_intr_init) { - if (x86_quirks->arch_pre_intr_init()) - return; - } -#endif - init_ISA_irqs(); -} - void __init native_init_IRQ(void) { int i; /* Execute any quirks before the call gates are initialised: */ - x86_quirk_pre_intr_init(); + x86_init.irqs.pre_vector_init(); apic_intr_init(); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 31e828118f8..1d6309d70df 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -73,14 +73,10 @@ static int __init visws_time_init(void) return 0; } -static int __init visws_pre_intr_init(void) +/* Replaces the default init_ISA_irqs in the generic setup */ +static void __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); - - /* - * We dont want ISA irqs to be set up by the generic code: - */ - return 1; } /* Quirk for machine specific memory setup. */ @@ -232,7 +228,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_pre_intr_init = visws_pre_intr_init, .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, }; @@ -257,6 +252,7 @@ void __init visws_early_detect(void) x86_init.resources.memory_setup = visws_memory_setup; x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; + x86_init.irqs.pre_vector_init = visws_pre_intr_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3488fb62ac0..f2abe2136da 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -9,6 +9,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -35,4 +36,8 @@ struct __initdata x86_init_ops x86_init = { .find_smp_config = default_find_smp_config, .get_smp_config = default_get_smp_config, 
}, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + }, }; -- cgit v1.2.3-70-g09d2 From 66bcaf0bde100a4b54b82fc6fea6ceee2212ffb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 09:59:09 +0200 Subject: x86: Move irq_init to x86_init_ops irq_init is overridden by x86_quirks and by paravirts. Unify the whole mess and make it an unconditional x86_init_ops function which defaults to the standard function and can be overridden by the early platform code. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/irq.h | 1 - arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/irqinit.c | 12 ++++-------- arch/x86/kernel/paravirt.c | 6 ------ arch/x86/kernel/setup.c | 17 ----------------- arch/x86/kernel/visws_quirks.c | 1 - arch/x86/kernel/x86_init.c | 1 + arch/x86/lguest/boot.c | 2 +- arch/x86/xen/irq.c | 5 +++-- 12 files changed, 11 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 8fe2782a253..ddda6cbed6f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -37,7 +37,6 @@ extern void fixup_irqs(void); #endif extern void (*generic_interrupt_extension)(void); -extern void init_IRQ(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 6d668968b6b..25922afb634 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -201,8 +201,6 @@ struct pv_cpu_ops { }; struct pv_irq_ops { - void (*init_IRQ)(void); - /* * Get/set interrupt state. 
save_fl and restore_fl are only * expected to use X86_EFLAGS_IF; all other bits diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 66a319709d6..404086f9411 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,12 +16,9 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_intr_init)(void); int (*arch_trap_init)(void); }; -extern void x86_quirk_intr_init(void); - extern void x86_quirk_trap_init(void); extern void x86_quirk_pre_time_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 65e3394c77f..8d7be65ccf7 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -47,9 +47,11 @@ struct x86_init_resources { * struct x86_init_irqs - platform specific interrupt setup * @pre_vector_init: init code to run before interrupt vectors * are set up. + * @intr_init: interrupt init code */ struct x86_init_irqs { void (*pre_vector_init)(void); + void (*intr_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index ec8b3113716..eafd341e42d 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_intr_init = NULL, .arch_trap_init = NULL, }; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index acdf088c758..e0142cda239 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -140,8 +140,10 @@ void __init init_ISA_irqs(void) } } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); +void init_IRQ(void) +{ + x86_init.irqs.intr_init(); +} static void __init smp_intr_init(void) { @@ -237,12 +239,6 @@ void __init native_init_IRQ(void) setup_irq(2, 
&irq2); #ifdef CONFIG_X86_32 - /* - * Call quirks after call gates are initialised (usually add in - * the architecture specific gates): - */ - x86_quirk_intr_init(); - /* * External FPU? Set up irq13 if so, for * original braindamaged IBM FERR coupling. diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 532c9a2626c..d76bfbec71a 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -183,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len, return insn_len; } -void init_IRQ(void) -{ - pv_irq_ops.init_IRQ(); -} - static void native_flush_tlb(void) { __native_flush_tlb(); @@ -328,7 +323,6 @@ struct pv_time_ops pv_time_ops = { }; struct pv_irq_ops pv_irq_ops = { - .init_IRQ = native_init_IRQ, .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 54043cb7ba6..d3da0f7333f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1020,23 +1020,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_intr_init - post gate setup interrupt initialisation - * - * Description: - * Fill in any interrupts that may have been left out by the general - * init_IRQ() routine. interrupts having to do with the machine rather - * than the devices on the I/O bus (like APIC interrupts in intel MP - * systems) are started here. 
- **/ -void __init x86_quirk_intr_init(void) -{ - if (x86_quirks->arch_intr_init) { - if (x86_quirks->arch_intr_init()) - return; - } -} - /** * x86_quirk_trap_init - initialise system specific traps * diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 1d6309d70df..a49013716da 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -228,7 +228,6 @@ static int visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_intr_init = NULL, .arch_trap_init = visws_trap_init, }; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index f2abe2136da..8cb59332e3b 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -39,5 +39,6 @@ struct __initdata x86_init_ops x86_init = { .irqs = { .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 11445c176de..1ff986511f1 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1262,7 +1262,6 @@ __init void lguest_init(void) */ /* Interrupt-related operations */ - pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); @@ -1325,6 +1324,7 @@ __init void lguest_init(void) pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; + x86_init.irqs.intr_init = lguest_init_IRQ; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index cfd17799bd6..9d30105a0c4 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -1,5 +1,7 @@ #include +#include + #include #include #include @@ -112,8 +114,6 @@ static void xen_halt(void) } static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, - .save_fl = 
PV_CALLEE_SAVE(xen_save_fl), .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), @@ -129,4 +129,5 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { void __init xen_init_irq_ops() { pv_irq_ops = xen_irq_ops; + x86_init.irqs.intr_init = xen_init_IRQ; } -- cgit v1.2.3-70-g09d2 From 428cf9025b15573e16e658032f2b963283e34ae0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 10:35:46 +0200 Subject: x86: Move traps_init to x86_init_ops Replace the quirks by a simple x86_init_ops function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/setup.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/numaq_32.c | 1 - arch/x86/kernel/setup.c | 15 --------------- arch/x86/kernel/traps.c | 5 ++--- arch/x86/kernel/visws_quirks.c | 8 +++----- arch/x86/kernel/x86_init.c | 1 + 7 files changed, 8 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 404086f9411..7751d1f92bc 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,11 +16,8 @@ struct x86_quirks { int (*arch_pre_time_init)(void); int (*arch_time_init)(void); - int (*arch_trap_init)(void); }; -extern void x86_quirk_trap_init(void); - extern void x86_quirk_pre_time_init(void); extern void x86_quirk_time_init(void); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 8d7be65ccf7..07c37bd879f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -48,10 +48,12 @@ struct x86_init_resources { * @pre_vector_init: init code to run before interrupt vectors * are set up. 
* @intr_init: interrupt init code + * @trap_init: platform specific trap setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); + void (*trap_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index eafd341e42d..71c5ea64586 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -265,7 +265,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, - .arch_trap_init = NULL, }; static __init void early_check_numaq(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3da0f7333f..bf3b87f1f7d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1020,21 +1020,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -/** - * x86_quirk_trap_init - initialise system specific traps - * - * Description: - * Called as the final act of trap_init(). Used in VISWS to initialise - * the various board specific APIC traps. 
- **/ -void __init x86_quirk_trap_init(void) -{ - if (x86_quirks->arch_trap_init) { - if (x86_quirks->arch_trap_init()) - return; - } -} - static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7e4b1f5dec8..ed96ed53f69 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -59,6 +59,7 @@ #include #ifdef CONFIG_X86_64 +#include #include #include #else @@ -980,7 +981,5 @@ void __init trap_init(void) */ cpu_init(); -#ifdef CONFIG_X86_32 - x86_quirk_trap_init(); -#endif + x86_init.irqs.trap_init(); } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index a49013716da..2719091b335 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -224,11 +224,10 @@ static void __init visws_find_smp_config(unsigned int reserve) mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; } -static int visws_trap_init(void); +static void visws_trap_init(void); static struct x86_quirks visws_x86_quirks __initdata = { .arch_time_init = visws_time_init, - .arch_trap_init = visws_trap_init, }; void __init visws_early_detect(void) @@ -252,6 +251,7 @@ void __init visws_early_detect(void) x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; x86_init.irqs.pre_vector_init = visws_pre_intr_init; + x86_init.irqs.trap_init = visws_trap_init; /* * Install reboot quirks: @@ -390,12 +390,10 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -static int __init visws_trap_init(void) +static void __init visws_trap_init(void) { lithium_init(); cobalt_init(); - - return 1; } /* diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 8cb59332e3b..9f2b775dc72 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -40,5 +40,6 @@ struct __initdata x86_init_ops x86_init = { .irqs = { 
.pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, + .trap_init = x86_init_noop, }, }; -- cgit v1.2.3-70-g09d2 From 42bbdb43b16d233b2dacb4cd76e28f61c2a86dc6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:04:10 +0200 Subject: x86: Replace ARCH_SETUP by a proper x86_init_ops ARCH_SETUP is a horrible leftover from the old arch/i386 mach support code. It still has a lonely user in xen. Move it to x86_init_ops. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 6 +----- arch/x86/kernel/x86_init.c | 4 ++++ arch/x86/xen/enlighten.c | 2 +- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6a07af432c8..22cb3872f6d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -24,7 +24,6 @@ static inline void load_sp0(struct tss_struct *tss, PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); } -#define ARCH_SETUP pv_init_ops.arch_setup(); static inline unsigned long get_wallclock(void) { return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 25922afb634..a05085e5fdb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -80,7 +80,6 @@ struct pv_init_ops { unsigned long addr, unsigned len); /* Basic arch-specific setup */ - void (*arch_setup)(void); void (*post_allocator_init)(void); /* Print a banner to identify the environment */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 07c37bd879f..ceffbf358fc 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -56,6 +56,14 @@ struct x86_init_irqs { void (*trap_init)(void); }; +/** + * 
struct x86_init_oem - oem platform specific customizing functions + * @arch_setup: platform specific architecure setup + */ +struct x86_init_oem { + void (*arch_setup)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -64,6 +72,7 @@ struct x86_init_ops { struct x86_init_resources resources; struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; + struct x86_init_oem oem; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d76bfbec71a..80275ef1651 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -311,7 +311,6 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, .banner = default_banner, - .arch_setup = paravirt_nop, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bf3b87f1f7d..d12aa82c9c3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -108,10 +108,6 @@ #include #endif -#ifndef ARCH_SETUP -#define ARCH_SETUP -#endif - /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 
* The direct mapping extends to max_pfn_mapped, so that we can directly access @@ -750,7 +746,7 @@ void __init setup_arch(char **cmdline_p) } #endif - ARCH_SETUP + x86_init.oem.arch_setup(); setup_memory_map(); parse_setup_data(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9f2b775dc72..fa2d849be35 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -42,4 +42,8 @@ struct __initdata x86_init_ops x86_init = { .intr_init = native_init_IRQ, .trap_init = x86_init_noop, }, + + .oem = { + .arch_setup = x86_init_noop, + }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 50b20c64f0b..73c7b1d610f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,7 +841,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, .banner = xen_banner, - .arch_setup = xen_arch_setup, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,7 @@ asmlinkage void __init xen_start_kernel(void) pv_mmu_ops = xen_mmu_ops; x86_init.resources.memory_setup = xen_memory_setup; + x86_init.oem.arch_setup = xen_arch_setup; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3-70-g09d2 From 6f30c1ac3fcf11e08f00670f293546a112cdf4e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:19:57 +0200 Subject: x86: Move paravirt banner printout to x86_init_ops Replace another obscure paravirt magic and move it to x86_init_ops. Such a hook is also useful for embedded and special hardware. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 6 +++++- arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/paravirt.c | 10 +--------- arch/x86/kernel/setup.c | 1 + arch/x86/kernel/x86_init.c | 2 ++ arch/x86/xen/enlighten.c | 2 +- 7 files changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 22cb3872f6d..3de6435a106 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -918,6 +918,8 @@ static inline unsigned long __raw_local_irq_save(void) #undef PVOP_VCALL4 #undef PVOP_CALL4 +extern void default_banner(void); + #else /* __ASSEMBLY__ */ #define _PVSITE(ptype, clobbers, ops, word, algn) \ @@ -1058,5 +1060,7 @@ static inline unsigned long __raw_local_irq_save(void) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_PARAVIRT */ +#else /* CONFIG_PARAVIRT */ +# define default_banner x86_init_noop +#endif /* !CONFIG_PARAVIRT */ #endif /* _ASM_X86_PARAVIRT_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index a05085e5fdb..ce7723c81a1 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -81,9 +81,6 @@ struct pv_init_ops { /* Basic arch-specific setup */ void (*post_allocator_init)(void); - - /* Print a banner to identify the environment */ - void (*banner)(void); }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ceffbf358fc..ee7c59df781 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -59,9 +59,11 @@ struct x86_init_irqs { /** * struct x86_init_oem - oem platform specific customizing functions * @arch_setup: platform specific architecure setup + * @banner: print a platform specific banner */ struct x86_init_oem { void (*arch_setup)(void); + void (*banner)(void); }; /** diff --git 
a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 80275ef1651..f7a5fb79d18 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -54,7 +54,7 @@ u64 _paravirt_ident_64(u64 x) return x; } -static void __init default_banner(void) +void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); @@ -208,13 +208,6 @@ extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); -static int __init print_banner(void) -{ - pv_init_ops.banner(); - return 0; -} -core_initcall(print_banner); - static struct resource reserve_ioports = { .start = 0, .end = IO_SPACE_LIMIT, @@ -310,7 +303,6 @@ struct pv_info pv_info = { struct pv_init_ops pv_init_ops = { .patch = native_patch, - .banner = default_banner, }; struct pv_time_ops pv_time_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d12aa82c9c3..bc5f0e561cf 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1012,6 +1012,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + x86_init.oem.banner(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index fa2d849be35..08fea49d59a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -45,5 +46,6 @@ struct __initdata x86_init_ops x86_init = { .oem = { .arch_setup = x86_init_noop, + .banner = default_banner, }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 73c7b1d610f..46e23cde143 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -840,7 +840,6 @@ static const struct pv_info xen_info __initdata = { static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, - .banner = xen_banner, .post_allocator_init = xen_post_allocator_init, }; @@ -982,6 +981,7 @@ asmlinkage 
void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; + x86_init.oem.banner = xen_banner; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3-70-g09d2 From 030cb6c00d242c20e92a3327d0cac17ce02d0cc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 14:30:02 +0200 Subject: x86: Move paravirt pagetable_setup to x86_init_ops Replace more paravirt hackery by proper x86_init_ops. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 10 ---------- arch/x86/include/asm/paravirt_types.h | 9 --------- arch/x86/include/asm/pgtable.h | 10 ---------- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/include/asm/x86_init.h | 13 +++++++++++++ arch/x86/kernel/paravirt.c | 7 ------- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/x86_init.c | 6 ++++++ arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/mmu.c | 11 +++++++---- arch/x86/xen/mmu.h | 2 +- 11 files changed, 32 insertions(+), 46 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 3de6435a106..1caf25b91e6 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -351,16 +351,6 @@ static inline void paravirt_post_allocator_init(void) (*pv_init_ops.post_allocator_init)(); } -static inline void paravirt_pagetable_setup_start(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_start)(base); -} - -static inline void paravirt_pagetable_setup_done(pgd_t *base) -{ - (*pv_mmu_ops.pagetable_setup_done)(base); -} - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index ce7723c81a1..4039eefd3eb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -231,15 +231,6 @@ struct pv_apic_ops { }; struct pv_mmu_ops { - /* - * Called before/after init_mm 
pagetable setup. setup_start - * may reset %cr3, and may pre-install parts of the pagetable; - * pagetable setup is expected to preserve any existing - * mapping. - */ - void (*pagetable_setup_start)(pgd_t *pgd_base); - void (*pagetable_setup_done)(pgd_t *pgd_base); - unsigned long (*read_cr2)(void); void (*write_cr2)(unsigned long); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 16748077559..60d422adf70 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -56,16 +56,6 @@ extern struct list_head pgd_list; #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) -static inline void __init paravirt_pagetable_setup_start(pgd_t *base) -{ - native_pagetable_setup_start(base); -} - -static inline void __init paravirt_pagetable_setup_done(pgd_t *base) -{ - native_pagetable_setup_done(base); -} - #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 54cb697f490..7b467bf3c68 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,8 +299,8 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); #else -static inline void native_pagetable_setup_start(pgd_t *base) {} -static inline void native_pagetable_setup_done(pgd_t *base) {} +#define native_pagetable_setup_start x86_init_pgd_noop +#define native_pagetable_setup_done x86_init_pgd_noop #endif struct seq_file; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ee7c59df781..b9bb4faefc4 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PLATFORM_H #define _ASM_X86_PLATFORM_H +#include + struct mpc_bus; struct mpc_cpu; struct mpc_table; @@ -66,6 +68,16 
@@ struct x86_init_oem { void (*banner)(void); }; +/** + * struct x86_init_paging - platform specific paging functions + * @pagetable_setup_start: platform specific pre paging_init() call + * @pagetable_setup_done: platform specific post paging_init() call + */ +struct x86_init_paging { + void (*pagetable_setup_start)(pgd_t *base); + void (*pagetable_setup_done)(pgd_t *base); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -75,6 +87,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_paging paging; }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f7a5fb79d18..8167be0b68c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -402,13 +402,6 @@ struct pv_apic_ops pv_apic_ops = { #endif struct pv_mmu_ops pv_mmu_ops = { -#ifndef CONFIG_X86_64 - .pagetable_setup_start = native_pagetable_setup_start, - .pagetable_setup_done = native_pagetable_setup_done, -#else - .pagetable_setup_start = paravirt_nop, - .pagetable_setup_done = paravirt_nop, -#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bc5f0e561cf..4952d63dd67 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -959,9 +959,9 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif - paravirt_pagetable_setup_start(swapper_pg_dir); + x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); - paravirt_pagetable_setup_done(swapper_pg_dir); + x86_init.paging.pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 08fea49d59a..7df020e6740 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -14,6 +14,7 @@ void __cpuinit x86_init_noop(void) { } void __init 
x86_init_uint_noop(unsigned int unused) { } +void __init x86_init_pgd_noop(pgd_t *unused) { } /* * The platform setup functions are preset with the default functions @@ -48,4 +49,9 @@ struct __initdata x86_init_ops x86_init = { .arch_setup = x86_init_noop, .banner = default_banner, }, + + .paging = { + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + }, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 46e23cde143..12ea09ec39b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -977,7 +977,6 @@ asmlinkage void __init xen_start_kernel(void) pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; - pv_mmu_ops = xen_mmu_ops; x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; @@ -991,6 +990,7 @@ asmlinkage void __init xen_start_kernel(void) load_percpu_segment(0); #endif + xen_init_mmu_ops(); xen_init_irq_ops(); xen_init_cpuid_mask(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4ceb2858165..dbec51da930 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1875,10 +1875,7 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - +static const struct pv_mmu_ops xen_mmu_ops __initdata = { .read_cr2 = xen_read_cr2, .write_cr2 = xen_write_cr2, @@ -1954,6 +1951,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_fixmap = xen_set_fixmap, }; +void __init xen_init_mmu_ops(void) +{ + x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; + x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; + pv_mmu_ops = xen_mmu_ops; +} #ifdef CONFIG_XEN_DEBUG_FS diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index da730262489..5fe6bc7f5ec 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ 
-59,5 +59,5 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, unsigned long xen_read_cr2_direct(void); -extern const struct pv_mmu_ops xen_mmu_ops; +extern void xen_init_mmu_ops(void); #endif /* _XEN_MMU_H */ -- cgit v1.2.3-70-g09d2 From f1d7062a235d057e5d85ed2860bef609e0160cde Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 13:13:52 +0200 Subject: x86: Move xen_post_allocator_init into xen_pagetable_setup_done We really do not need two paravirt/x86_init_ops functions which are called in two consecutive source lines. Move the only user of post_allocator_init into the already existing pagetable_setup_done function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 6 ------ arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/setup.h | 4 ---- arch/x86/kernel/setup.c | 1 - arch/x86/xen/enlighten.c | 2 -- arch/x86/xen/mmu.c | 5 ++++- arch/x86/xen/xen-ops.h | 2 -- 7 files changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1caf25b91e6..7ce415e844b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -345,12 +345,6 @@ static inline void setup_secondary_clock(void) } #endif -static inline void paravirt_post_allocator_init(void) -{ - if (pv_init_ops.post_allocator_init) - (*pv_init_ops.post_allocator_init)(); -} - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 4039eefd3eb..ecc74e5ad40 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -78,9 +78,6 @@ struct pv_init_ops { */ unsigned (*patch)(u8 type, u16 clobber, void *insnbuf, unsigned long addr, unsigned len); - - /* Basic arch-specific setup */ - void (*post_allocator_init)(void); }; diff --git 
a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 7751d1f92bc..58b58952b80 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -63,10 +63,6 @@ static inline int is_visws_box(void) { return 0; } extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; -#ifndef CONFIG_PARAVIRT -#define paravirt_post_allocator_init() do {} while (0) -#endif - extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4952d63dd67..43ec6aa175b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -962,7 +962,6 @@ void __init setup_arch(char **cmdline_p) x86_init.paging.pagetable_setup_start(swapper_pg_dir); paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); - paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 12ea09ec39b..a924caa168d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -839,8 +839,6 @@ static const struct pv_info xen_info __initdata = { static const struct pv_init_ops xen_init_ops __initdata = { .patch = xen_patch, - - .post_allocator_init = xen_post_allocator_init, }; static const struct pv_time_ops xen_time_ops __initdata = { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dbec51da930..093dd59b538 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1229,9 +1229,12 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static void xen_post_allocator_init(void); + static __init void xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); + xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) @@ -1841,7 +1844,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -__init void xen_post_allocator_init(void) +static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pte 
= xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 22494fd4c9b..355fa6b99c9 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,8 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_post_allocator_init(void); - char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); -- cgit v1.2.3-70-g09d2 From 736decac643e8982655e22ac7f0e5e61c5b7f9bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 12:35:53 +0200 Subject: x86: Move percpu clockevents setup to x86_init_ops paravirt overrides the setup of the default apic timers as per cpu timers. Moorestown needs to override that as well. Move it to x86_init_ops setup and create a separate x86_cpuinit struct which holds the function for the secondary, possibly hotpluggable CPUs. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 5 ++--- arch/x86/include/asm/paravirt.h | 12 ------------ arch/x86/include/asm/paravirt_types.h | 3 --- arch/x86/include/asm/x86_init.h | 19 +++++++++++++++++++ arch/x86/kernel/apic/apic.c | 3 ++- arch/x86/kernel/kvmclock.c | 5 ++++- arch/x86/kernel/paravirt.c | 2 -- arch/x86/kernel/smpboot.c | 4 ++-- arch/x86/kernel/vmi_32.c | 4 ++-- arch/x86/kernel/x86_init.c | 9 +++++++++ arch/x86/xen/enlighten.c | 4 ++-- 11 files changed, 42 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index bb7d4792584..6f15b29005a 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -70,9 +70,6 @@ static inline void default_inquire_remote_apic(int apicid) */ #ifdef CONFIG_PARAVIRT #include -#else -#define setup_boot_clock setup_boot_APIC_clock -#define setup_secondary_clock setup_secondary_APIC_clock #endif #ifdef CONFIG_X86_64 @@ -245,6 +242,8 @@ static inline void
lapic_shutdown(void) { } static inline void init_apic_mappings(void) { } static inline void disable_local_APIC(void) { } static inline void apic_disable(void) { } +# define setup_boot_APIC_clock x86_init_noop +# define setup_secondary_APIC_clock x86_init_noop #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7ce415e844b..825674a968d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -333,18 +333,6 @@ static inline void slow_down_io(void) #endif } -#ifdef CONFIG_X86_LOCAL_APIC -static inline void setup_boot_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_boot_clock); -} - -static inline void setup_secondary_clock(void) -{ - PVOP_VCALL0(pv_apic_ops.setup_secondary_clock); -} -#endif - #ifdef CONFIG_SMP static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip, unsigned long start_esp) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index ecc74e5ad40..1da89276d14 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -218,9 +218,6 @@ struct pv_irq_ops { struct pv_apic_ops { #ifdef CONFIG_X86_LOCAL_APIC - void (*setup_boot_clock)(void); - void (*setup_secondary_clock)(void); - void (*startup_ipi_hook)(int phys_apicid, unsigned long start_eip, unsigned long start_esp); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b9bb4faefc4..b7d258f4c40 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -78,6 +78,15 @@ struct x86_init_paging { void (*pagetable_setup_done)(pgd_t *base); }; +/** + * struct x86_init_timers - platform specific timer setup + * @setup_perpcu_clockev: set up the per cpu clock event device for the + * boot cpu + */ +struct x86_init_timers { + void (*setup_percpu_clockev)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -88,9 
+97,19 @@ struct x86_init_ops { struct x86_init_irqs irqs; struct x86_init_oem oem; struct x86_init_paging paging; + struct x86_init_timers timers; +}; + +/** + * struct x86_cpuinit_ops - platform specific cpu hotplug setups + * @setup_percpu_clockev: set up the per cpu clock event device + */ +struct x86_cpuinit_ops { + void (*setup_percpu_clockev)(void); }; extern struct x86_init_ops x86_init; +extern struct x86_cpuinit_ops x86_cpuinit; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0a1c2830ec6..ce0098066e9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -1701,7 +1702,7 @@ int __init APIC_init_uniprocessor(void) localise_nmi_watchdog(); #endif - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); #ifdef CONFIG_X86_64 check_nmi_watchdog(); #endif diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 223af43f152..64e9b5f59d2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -22,6 +22,8 @@ #include #include #include + +#include #include #define KVM_SCALE 22 @@ -187,7 +189,8 @@ void __init kvmclock_init(void) pv_time_ops.sched_clock = kvm_clock_read; pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8167be0b68c..1ed32c79679 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -387,8 +387,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = setup_boot_APIC_clock, - .setup_secondary_clock = 
setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, #endif }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee6..6eb81a87b4b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -323,7 +323,7 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - setup_secondary_clock(); + x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_idle(); @@ -1112,7 +1112,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); - setup_boot_clock(); + x86_init.timers.setup_percpu_clockev(); if (is_uv_system()) uv_system_init(); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 95a7289e4b0..b43b6685cae 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -821,8 +821,8 @@ static inline int __init activate_vmi(void) pv_time_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - pv_apic_ops.setup_boot_clock = vmi_time_bsp_init; - pv_apic_ops.setup_secondary_clock = vmi_time_ap_init; + x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init; + x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; pv_time_ops.get_tsc_khz = vmi_tsc_khz; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 7df020e6740..e666a98db7c 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -54,4 +55,12 @@ struct __initdata x86_init_ops x86_init = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + }, +}; + +__cpuinitdata struct x86_cpuinit_ops x86_cpuinit = { + .setup_percpu_clockev = setup_secondary_APIC_clock, }; diff --git 
a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a924caa168d..14e597e0c16 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -912,8 +912,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC - .setup_boot_clock = paravirt_nop, - .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, #endif }; @@ -979,6 +977,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + x86_init.timers.setup_percpu_clockev = x86_init_noop; + x86_cpuinit.setup_percpu_clockev = x86_init_noop; #ifdef CONFIG_X86_64 /* -- cgit v1.2.3-70-g09d2 From 845b3944bbdf9e9247849bf037f27ff3a3f26d87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Aug 2009 15:37:03 +0200 Subject: x86: Add timer_init to x86_init_ops The timer init code is convoluted with several quirks and the paravirt timer chooser. Figuring out which code path is actually taken is not for the faint hearted. Move the numaq TSC quirk to tsc_pre_init x86_init_ops function and replace the paravirt time chooser and the remaining x86 quirk with a simple x86_init_ops function. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 5 ---- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/include/asm/setup.h | 21 ++--------------- arch/x86/include/asm/time.h | 1 - arch/x86/include/asm/timer.h | 3 +-- arch/x86/include/asm/x86_init.h | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 10 ++------ arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 43 ----------------------------------- arch/x86/kernel/time_32.c | 34 ++++++++++++++++++--------- arch/x86/kernel/time_64.c | 9 ++++++-- arch/x86/kernel/tsc.c | 2 ++ arch/x86/kernel/visws_quirks.c | 20 ++++------------ arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/x86_init.c | 3 +++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 4 ++-- 17 files changed, 53 insertions(+), 113 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 825674a968d..11a4ba7b209 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -34,11 +34,6 @@ static inline int set_wallclock(unsigned long nowtime) return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime); } -static inline void (*choose_time_init(void))(void) -{ - return pv_time_ops.time_init; -} - /* The paravirtualized CPUID instruction. 
*/ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 1da89276d14..0d812e592e3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -88,8 +88,6 @@ struct pv_lazy_ops { }; struct pv_time_ops { - void (*time_init)(void); - /* Set and set time of day */ unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 58b58952b80..861e1fe2303 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -5,24 +5,6 @@ #define COMMAND_LINE_SIZE 2048 -#ifndef __ASSEMBLY__ - -#include - -/* - * Any setup quirks to be performed? - */ - -struct x86_quirks { - int (*arch_pre_time_init)(void); - int (*arch_time_init)(void); -}; - -extern void x86_quirk_pre_time_init(void); -extern void x86_quirk_time_init(void); - -#endif /* __ASSEMBLY__ */ - #ifdef __i386__ #include @@ -42,6 +24,7 @@ extern void x86_quirk_time_init(void); #ifndef __ASSEMBLY__ #include +#include /* Interrupt control for vSMPowered x86_64 systems */ #ifdef CONFIG_X86_64 @@ -60,11 +43,11 @@ static inline void visws_early_detect(void) { } static inline int is_visws_box(void) { return 0; } #endif -extern struct x86_quirks *x86_quirks; extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); +extern void setup_default_timer_irq(void); #ifndef _SETUP diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 50c733aac42..91bb162b5a3 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -54,7 +54,6 @@ extern void time_init(void); #define get_wallclock() native_get_wallclock() #define set_wallclock(x) native_set_wallclock(x) -#define choose_time_init() hpet_time_init #endif /* CONFIG_PARAVIRT */ diff --git 
a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 20ca9c4d468..e854c7ab416 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -12,8 +12,7 @@ unsigned long native_calibrate_tsc(void); #ifdef CONFIG_X86_32 extern int timer_ack; -extern irqreturn_t timer_interrupt(int irq, void *dev_id); -#endif /* CONFIG_X86_32 */ +#endif extern int recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index b7d258f4c40..f8bdd2271a0 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -82,9 +82,13 @@ struct x86_init_paging { * struct x86_init_timers - platform specific timer setup * @setup_perpcu_clockev: set up the per cpu clock event device for the * boot cpu + * @tsc_pre_init: platform function called before TSC init + * @timer_init: initialize the platform timer (default PIT/HPET) */ struct x86_init_timers { void (*setup_percpu_clockev)(void); + void (*tsc_pre_init)(void); + void (*timer_init)(void); }; /** diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 71c5ea64586..f1ebed6bd15 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -129,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void) } } -static int __init numaq_pre_time_init(void) +static void __init numaq_tsc_init(void) { numaq_tsc_disable(); - return 0; } static inline int generate_logical_apicid(int quad, int phys_apicid) @@ -262,11 +261,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) } } -static struct x86_quirks numaq_x86_quirks __initdata = { - .arch_pre_time_init = numaq_pre_time_init, - .arch_time_init = NULL, -}; - static __init void early_check_numaq(void) { /* @@ -281,13 +275,13 @@ static __init void early_check_numaq(void) early_get_smp_config(); if (found_numaq) { - x86_quirks = &numaq_x86_quirks; x86_init.mpparse.mpc_record = numaq_mpc_record; 
x86_init.mpparse.setup_ioapic_ids = x86_init_noop; x86_init.mpparse.mpc_apic_id = mpc_apic_id; x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; + x86_init.timers.tsc_pre_init = numaq_tsc_init; } } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ed32c79679..9c0e644a76d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -306,7 +306,6 @@ struct pv_init_ops pv_init_ops = { }; struct pv_time_ops pv_time_ops = { - .time_init = hpet_time_init, .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 43ec6aa175b..bb207a47c63 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -626,10 +626,6 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif -static struct x86_quirks default_x86_quirks __initdata; - -struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; - #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -1016,45 +1012,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -/** - * x86_quirk_pre_time_init - do any specific initialisations before. - * - **/ -void __init x86_quirk_pre_time_init(void) -{ - if (x86_quirks->arch_pre_time_init) - x86_quirks->arch_pre_time_init(); -} - -/** - * x86_quirk_time_init - do any specific initialisations for the system timer. 
- * - * Description: - * Must plug the system timer interrupt source at HZ into the IRQ listed - * in irq_vectors.h:TIMER_IRQ - **/ -void __init x86_quirk_time_init(void) -{ - if (x86_quirks->arch_time_init) { - /* - * A nonzero return code does not mean failure, it means - * that the architecture quirk does not want any - * generic (timer) setup to be performed after this: - */ - if (x86_quirks->arch_time_init()) - return; - } - - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - static struct resource video_ram_resource = { .name = "Video RAM area", .start = 0xa0000, diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 5c5d87f0b2e..89bbb52218b 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -72,7 +72,7 @@ EXPORT_SYMBOL(profile_pc); * Time Stamp Counter value at the time of the timer interrupt, so that * we later on can estimate the time of day more exactly. */ -irqreturn_t timer_interrupt(int irq, void *dev_id) +static irqreturn_t timer_interrupt(int irq, void *dev_id) { /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); @@ -113,25 +113,37 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* Duplicate of time_init() below, with hpet_enable part added */ +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); - x86_quirk_time_init(); + setup_default_timer_irq(); +} + +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); } /* - * This is called directly from init code; we must delay timer setup in the - * HPET case as we can't make the decision to turn on HPET this early in the - * boot process. 
- * - * The chosen time_init function will usually be hpet_time_init, above, but - * in the case of virtual hardware, an alternative function may be substituted. + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. */ void __init time_init(void) { - x86_quirk_pre_time_init(); tsc_init(); - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 5ba343e6184..38a7df94c10 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -127,9 +128,13 @@ void __init hpet_time_init(void) setup_irq(0, &irq0); } +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); +} + void __init time_init(void) { tsc_init(); - - late_time_init = choose_time_init(); + late_time_init = x86_late_time_init; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357..652bc214eeb 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -857,6 +857,8 @@ void __init tsc_init(void) u64 lpj; int cpu; + x86_init.timers.tsc_pre_init(); + if (!cpu_has_tsc) return; diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 2719091b335..f068553a1b1 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init(void) +static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -66,11 +67,7 @@ static int __init visws_time_init(void) /* Enable (unmask) the timer interrupt */ co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - /* - * Zero return means the generic timer setup code will set up - * the standard vector: - */ - return 0; + 
setup_default_timer_irq(); } /* Replaces the default init_ISA_irqs in the generic setup */ @@ -226,10 +223,6 @@ static void __init visws_find_smp_config(unsigned int reserve) static void visws_trap_init(void); -static struct x86_quirks visws_x86_quirks __initdata = { - .arch_time_init = visws_time_init, -}; - void __init visws_early_detect(void) { int raw; @@ -241,17 +234,14 @@ void __init visws_early_detect(void) return; /* - * Install special quirks for timer, interrupt and memory setup: - * Fall back to generic behavior for traps: - * Override generic MP-table parsing: + * Override the default platform setup functions */ - x86_quirks = &visws_x86_quirks; - x86_init.resources.memory_setup = visws_memory_setup; x86_init.mpparse.get_smp_config = visws_get_smp_config; x86_init.mpparse.find_smp_config = visws_find_smp_config; x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; + x86_init.timers.timer_init = visws_time_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b43b6685cae..cd7d0fbbf66 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -817,7 +817,7 @@ static inline int __init activate_vmi(void) vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); vmi_timer_ops.cancel_alarm = vmi_get_function(VMI_CALL_CancelAlarm); - pv_time_ops.time_init = vmi_time_init; + x86_init.timers.timer_init = vmi_time_init; pv_time_ops.get_wallclock = vmi_get_wallclock; pv_time_ops.set_wallclock = vmi_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index e666a98db7c..4790b92714a 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -11,6 +11,7 @@ #include #include #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -58,6 +59,8 @@ struct __initdata x86_init_ops x86_init = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = 
x86_init_noop, + .timer_init = hpet_time_init, }, }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1ff986511f1..6caa8c0c793 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; + x86_init.timers.timer_init = lguest_time_init; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 14e597e0c16..84826b842b5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -842,8 +842,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .time_init = xen_time_init, - .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, .get_tsc_khz = xen_tsc_khz, @@ -977,6 +975,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; + + x86_init.timers.timer_init = xen_time_init; x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; -- cgit v1.2.3-70-g09d2 From ecce85089e6d31eed7535b68f5acdd194265690c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:28:50 +0200 Subject: x86: Remove do_timer hook This is a left over of the old x86 sub arch support. 
Remove it and open code it like we do in time_64.c Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/do_timer.h | 16 ---------------- arch/x86/kernel/time_32.c | 7 ++++--- 2 files changed, 4 insertions(+), 19 deletions(-) delete mode 100644 arch/x86/include/asm/do_timer.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/do_timer.h b/arch/x86/include/asm/do_timer.h deleted file mode 100644 index 23ecda0b28a..00000000000 --- a/arch/x86/include/asm/do_timer.h +++ /dev/null @@ -1,16 +0,0 @@ -/* defines for inline arch setup functions */ -#include - -#include -#include - -/** - * do_timer_interrupt_hook - hook into timer tick - * - * Call the pit clock event handler. see asm/i8253.h - **/ - -static inline void do_timer_interrupt_hook(void) -{ - global_clock_event->event_handler(global_clock_event); -} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 89bbb52218b..6fef4ea1e7a 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -28,6 +28,7 @@ * serialize accesses to xtime/lost_ticks). */ +#include #include #include #include @@ -37,8 +38,8 @@ #include #include #include - -#include +#include +#include int timer_ack; @@ -92,7 +93,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) } #endif - do_timer_interrupt_hook(); + global_clock_event->event_handler(global_clock_event); #ifdef CONFIG_MCA if (MCA_bus) { -- cgit v1.2.3-70-g09d2 From dd3e6e8c6e7a2294f137c4dbccb3e73e7fa8ba15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:35:23 +0200 Subject: x86: Prepare unification of time_32/64.c Unify the top comment and the includes. 
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 44 +++++++++++++------------------------------- arch/x86/kernel/time_64.c | 13 +++++-------- 2 files changed, 18 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 6fef4ea1e7a..acbaefd61e8 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -1,45 +1,27 @@ /* - * Copyright (C) 1991, 1992, 1995 Linus Torvalds + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian , Andrew D. - * Balsa , Philip Gladstone ; - * ported from 2.0.35 Jumbo-9 by Michael Krause ). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). 
*/ #include -#include #include #include #include -#include -#include -#include -#include +#include +#include #include #include +#include +#include +#include +#include int timer_ack; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 38a7df94c10..45914f8844a 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -1,6 +1,4 @@ /* - * "High Precision Event Timer" based timekeeping. - * * Copyright (c) 1991,1992,1995 Linus Torvalds * Copyright (c) 1994 Alan Modra * Copyright (c) 1995 Markus Kuhn @@ -8,23 +6,22 @@ * Copyright (c) 1998 Andrea Arcangeli * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen - * RTC support code taken from arch/i386/kernel/timers/time_hpet.c + * */ #include -#include #include -#include #include #include -#include +#include #include +#include #include +#include #include -#include #include -#include +#include volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -- cgit v1.2.3-70-g09d2 From 64fcbac1f38882d8ae82c44a1c2a676cfa5e79e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:54:21 +0200 Subject: x86: Simplify timer_ack magic in time_32.c Let the compiler optimize the timer_ack magic away in the 32bit timer interrupt and put the same code into time_64.c. It's optimized out for CONFIG_X86_IO_APIC on 32bit and for 64bit because timer_ack is const 0 in both cases. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/timer.h | 6 ++++-- arch/x86/kernel/time_32.c | 5 +++-- arch/x86/kernel/time_64.c | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index e854c7ab416..65228ccc5f0 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -9,11 +9,13 @@ unsigned long long native_sched_clock(void); unsigned long native_calibrate_tsc(void); +extern int recalibrate_cpu_khz(void); -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) extern int timer_ack; +#else +# define timer_ack (0) #endif -extern int recalibrate_cpu_khz(void); extern int no_timer_check; diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index acbaefd61e8..7a26bcf887f 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -23,7 +23,9 @@ #include #include +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; +#endif unsigned long profile_pc(struct pt_regs *regs) { @@ -60,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); -#ifdef CONFIG_X86_IO_APIC + /* Optimized out for !IO_APIC and x86_64 */ if (timer_ack) { /* * Subtle, when I/O APICs are used we have to ack timer IRQ @@ -73,7 +75,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) inb(PIC_MASTER_POLL); spin_unlock(&i8259A_lock); } -#endif global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 45914f8844a..35e0a925da5 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -51,6 +51,20 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) { inc_irq_stat(irq0_irqs); + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to 
deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. */ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + global_clock_event->event_handler(global_clock_event); #ifdef CONFIG_MCA -- cgit v1.2.3-70-g09d2 From 0be6939422eb2f54df4b3d8763c569c6759c1a42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 15:59:35 +0200 Subject: x86: Remove mca bus ifdef from timer interrupt MCA_bus is constant 0 when CONFIG_MCA=n. So the compiler removes that code w/o needing an extra #ifdef Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 18 +++--------------- arch/x86/kernel/time_64.c | 9 +++------ 2 files changed, 6 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 7a26bcf887f..ec729cdcfa3 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -78,21 +78,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); -#ifdef CONFIG_MCA - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. 
*/ - - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */ - } -#endif + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 35e0a925da5..7db3912b869 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -67,12 +67,9 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); -#ifdef CONFIG_MCA - if (MCA_bus) { - u8 irq_v = inb_p(0x61); /* read the current state */ - outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ - } -#endif + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); return IRQ_HANDLED; } -- cgit v1.2.3-70-g09d2 From 454ede7eebf91b92ab1eafe10c6b6ed04de29bf8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:07:40 +0200 Subject: x86: Make timer setup and global variables the same in time_32/64.c The timer and timer irq setup code is identical in 32 and 64 bit. Make it the same formatting as well. Also add the global variables under the necessary ifdefs to both files. 
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 8 +++++--- arch/x86/kernel/time_64.c | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ec729cdcfa3..186abc577b2 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -27,6 +27,10 @@ int timer_ack; #endif +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -53,9 +57,7 @@ unsigned long profile_pc(struct pt_regs *regs) EXPORT_SYMBOL(profile_pc); /* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. + * Default timer interrupt handler for PIT/HPET */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 7db3912b869..78cbdf5c006 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -23,7 +23,13 @@ #include #include +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; +#endif + +#ifdef CONFIG_X86_64 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif unsigned long profile_pc(struct pt_regs *regs) { @@ -47,8 +53,12 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); +/* + * Default timer interrupt handler for PIT/HPET + */ static irqreturn_t timer_interrupt(int irq, void *dev_id) { + /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); /* Optimized out for !IO_APIC and x86_64 */ @@ -74,8 +84,10 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency */ +/* + * 
calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ #define TICK_COUNT 100000000 unsigned long __init calibrate_cpu(void) { @@ -122,18 +134,24 @@ unsigned long __init calibrate_cpu(void) return pmc_now * tsc_khz / (tsc_now - tsc_start); } -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER, - .name = "timer" +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" }; +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); - - setup_irq(0, &irq0); + setup_default_timer_irq(); } static void x86_late_time_init(void) @@ -141,6 +159,10 @@ static void x86_late_time_init(void) x86_init.timers.timer_init(); } +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ void __init time_init(void) { tsc_init(); -- cgit v1.2.3-70-g09d2 From 08047c4f1740c7cee75d58e2919d48c09f951649 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:27:41 +0200 Subject: x86: Move calibrate_cpu to tsc.c Move the code where it's only user is. Also we need to look whether this hardwired hackery might interfere with perfcounters. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/time.h | 2 -- arch/x86/kernel/time_32.c | 1 - arch/x86/kernel/time_64.c | 51 ---------------------------------------- arch/x86/kernel/tsc.c | 57 +++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index 91bb162b5a3..9c5608b21c2 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -57,6 +57,4 @@ extern void time_init(void); #endif /* CONFIG_PARAVIRT */ -extern unsigned long __init calibrate_cpu(void); - #endif /* _ASM_X86_TIME_H */ diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 186abc577b2..fd876cc7748 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 78cbdf5c006..e59a40ebff1 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -21,7 +21,6 @@ #include #include #include -#include #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) int timer_ack; @@ -84,56 +83,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} - static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 652bc214eeb..97a0bcbad10 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -17,6 +17,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -852,6 +853,60 @@ static void __init init_tsc_clocksource(void) clocksource_register(&clocksource_tsc); } +#ifdef CONFIG_X86_64 +/* + * calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency + */ +#define TICK_COUNT 100000000 +static unsigned long __init calibrate_cpu(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... 
" + "cpu_khz value may be incorrect.\n"); + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start measuring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} +#else +static inline unsigned long calibrate_cpu(void) { return cpu_khz; } +#endif + void __init tsc_init(void) { u64 lpj; @@ -870,11 +925,9 @@ void __init tsc_init(void) return; } -#ifdef CONFIG_X86_64 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calibrate_cpu(); -#endif printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, -- cgit v1.2.3-70-g09d2 From ef4512882dbe9978e7a18ccbcb4cb45705ce5560 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2009 13:24:08 +0200 Subject: x86: time_32/64.c unify profile_pc The code is identical except for the formatting and a useless #ifdef. Make it the same. 
Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time_32.c | 13 ++++++------- arch/x86/kernel/time_64.c | 8 +++++--- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index fd876cc7748..fda0c34da75 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -34,23 +34,22 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); -#ifdef CONFIG_SMP if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = (unsigned long *)&regs->sp; - - /* Return address is either directly at stack pointer - or above a saved flags. Eflags has bits 22-31 zero, - kernel addresses don't. */ + unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; #endif } -#endif return pc; } EXPORT_SYMBOL(profile_pc); diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e59a40ebff1..fda0c34da75 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -34,14 +34,16 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or a copy - of flags from PUSHF - Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't.
+ */ if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) -- cgit v1.2.3-70-g09d2 From 47926214d8b2bef13b2be57c500194a804f16198 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:47:19 +0200 Subject: x86: Replace the now identical time_32/64.c by time.c Remove the redundant copy. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/time.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/time_32.c | 121 ---------------------------------------------- arch/x86/kernel/time_64.c | 121 ---------------------------------------------- 4 files changed, 122 insertions(+), 243 deletions(-) create mode 100644 arch/x86/kernel/time.c delete mode 100644 arch/x86/kernel/time_32.c delete mode 100644 arch/x86/kernel/time_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 313ed6fca9b..ccf3db607c2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,7 +31,7 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o +obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c new file mode 100644 index 00000000000..fda0c34da75 --- /dev/null +++ b/arch/x86/kernel/time.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 1991,1992,1995 Linus Torvalds + * Copyright (c) 1994 Alan Modra + * Copyright (c) 1995 Markus Kuhn + * Copyright (c) 1996 Ingo Molnar + * Copyright (c) 1998 Andrea Arcangeli + * Copyright (c) 2002,2006 Vojtech Pavlik + * Copyright (c) 2003 Andi Kleen + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) +int timer_ack; 
+#endif + +#ifdef CONFIG_X86_64 +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +#endif + +unsigned long profile_pc(struct pt_regs *regs) +{ + unsigned long pc = instruction_pointer(regs); + + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else + unsigned long *sp = (unsigned long *)regs->sp; + /* + * Return address is either directly at stack pointer + * or above a saved flags. Eflags has bits 22-31 zero, + * kernel addresses don't. + */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + /* Keep nmi watchdog up to date */ + inc_irq_stat(irq0_irqs); + + /* Optimized out for !IO_APIC and x86_64 */ + if (timer_ack) { + /* + * Subtle, when I/O APICs are used we have to ack timer IRQ + * manually to deassert NMI lines for the watchdog if run + * on an 82489DX-based system. + */ + spin_lock(&i8259A_lock); + outb(0x0c, PIC_MASTER_OCW3); + /* Ack the IRQ; AEOI will end it automatically. 
*/ + inb(PIC_MASTER_POLL); + spin_unlock(&i8259A_lock); + } + + global_clock_event->event_handler(global_clock_event); + + /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ + if (MCA_bus) + outb_p(inb_p(0x61)| 0x80, 0x61); + + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + irq0.mask = cpumask_of_cpu(0); + setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) + setup_pit_timer(); + setup_default_timer_irq(); +} + +static void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + tsc_init(); + late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c deleted file mode 100644 index fda0c34da75..00000000000 --- a/arch/x86/kernel/time_32.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - -#ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -#endif - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long 
*sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * Default timer interrupt handler for PIT/HPET - */ -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } - - global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - - return IRQ_HANDLED; -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -void __init setup_default_timer_irq(void) -{ - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - -/* Default timer init function */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - setup_default_timer_irq(); -} - -static void x86_late_time_init(void) -{ - x86_init.timers.timer_init(); -} - -/* - * Initialize TSC and delay the periodic timer init to - * late x86_late_time_init() so ioremap works. 
- */ -void __init time_init(void) -{ - tsc_init(); - late_time_init = x86_late_time_init; -} diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c deleted file mode 100644 index fda0c34da75..00000000000 --- a/arch/x86/kernel/time_64.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 1991,1992,1995 Linus Torvalds - * Copyright (c) 1994 Alan Modra - * Copyright (c) 1995 Markus Kuhn - * Copyright (c) 1996 Ingo Molnar - * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002,2006 Vojtech Pavlik - * Copyright (c) 2003 Andi Kleen - * - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - -#ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -#endif - -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - if (!user_mode_vm(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; -} -EXPORT_SYMBOL(profile_pc); - -/* - * Default timer interrupt handler for PIT/HPET - */ -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - /* Keep nmi watchdog up to date */ - inc_irq_stat(irq0_irqs); - - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. 
*/ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } - - global_clock_event->event_handler(global_clock_event); - - /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ - if (MCA_bus) - outb_p(inb_p(0x61)| 0x80, 0x61); - - return IRQ_HANDLED; -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, - .name = "timer" -}; - -void __init setup_default_timer_irq(void) -{ - irq0.mask = cpumask_of_cpu(0); - setup_irq(0, &irq0); -} - -/* Default timer init function */ -void __init hpet_time_init(void) -{ - if (!hpet_enable()) - setup_pit_timer(); - setup_default_timer_irq(); -} - -static void x86_late_time_init(void) -{ - x86_init.timers.timer_init(); -} - -/* - * Initialize TSC and delay the periodic timer init to - * late x86_late_time_init() so ioremap works. - */ -void __init time_init(void) -{ - tsc_init(); - late_time_init = x86_late_time_init; -} -- cgit v1.2.3-70-g09d2 From 2d826404f0bdcac2a4dd7e3c446b70d6a3b63b78 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 17:06:25 +0200 Subject: x86: Move tsc_calibration to x86_init_ops TSC calibration is modified by the vmware hypervisor and paravirt by separate means. Moorestown wants to add its own calibration routine as well. So make calibrate_tsc a proper x86_init_ops function and override it by paravirt or by the early setup of the vmware hypervisor. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/hypervisor.h | 2 +- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/timer.h | 5 ----- arch/x86/include/asm/tsc.h | 3 ++- arch/x86/include/asm/vmware.h | 2 +- arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/cpu/hypervisor.c | 14 +++++++------- arch/x86/kernel/cpu/vmware.c | 21 ++++++++++++--------- arch/x86/kernel/kvmclock.c | 2 +- arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/tsc.c | 13 ++++--------- arch/x86/kernel/vmi_32.c | 2 +- arch/x86/kernel/vmiclock_32.c | 2 +- arch/x86/kernel/x86_init.c | 5 +++++ arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 3 ++- 17 files changed, 48 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 369f5c5d09a..b78c0941e42 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__HYPERVISOR_H #define ASM_X86__HYPERVISOR_H -extern unsigned long get_hypervisor_tsc_freq(void); extern void init_hypervisor(struct cpuinfo_x86 *c); +extern void init_hypervisor_platform(void); #endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 11a4ba7b209..1e458a55330 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -210,7 +210,6 @@ static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -#define calibrate_tsc() (pv_time_ops.get_tsc_khz()) static inline unsigned long long paravirt_read_pmc(int counter) { diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 65228ccc5f0..5469630b27f 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -8,7 +8,6 @@ #define TICK_SIZE (tick_nsec / 1000) unsigned long long native_sched_clock(void); -unsigned long native_calibrate_tsc(void); extern int 
recalibrate_cpu_khz(void); #if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) @@ -19,10 +18,6 @@ extern int timer_ack; extern int no_timer_check; -#ifndef CONFIG_PARAVIRT -#define calibrate_tsc() native_calibrate_tsc() -#endif - /* Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 38ae163cc91..c0427295e8f 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -48,7 +48,8 @@ static __always_inline cycles_t vget_cycles(void) extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); -int check_tsc_unstable(void); +extern int check_tsc_unstable(void); +extern unsigned long native_calibrate_tsc(void); /* * Boot-time check whether the TSCs are synchronized across diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index c11b7e100d8..e49ed6d2fd4 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -20,7 +20,7 @@ #ifndef ASM_X86__VMWARE_H #define ASM_X86__VMWARE_H -extern unsigned long vmware_get_tsc_khz(void); +extern void vmware_platform_setup(void); extern int vmware_platform(void); extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f8bdd2271a0..20df5187171 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -112,8 +112,17 @@ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); }; +/** + * struct x86_platform_ops - platform specific runtime functions + * @calibrate_tsc: calibrate TSC + */ +struct x86_platform_ops { + unsigned long (*calibrate_tsc)(void); +}; + extern struct x86_init_ops x86_init; extern struct x86_cpuinit_ops x86_cpuinit; +extern struct x86_platform_ops x86_platform; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); diff --git 
a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 93ba8eeb100..08be922de33 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c) c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; } -unsigned long get_hypervisor_tsc_freq(void) -{ - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - return vmware_get_tsc_khz(); - return 0; -} - static inline void __cpuinit hypervisor_set_feature_bits(struct cpuinfo_x86 *c) { @@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) detect_hypervisor_vendor(c); hypervisor_set_feature_bits(c); } + +void __init init_hypervisor_platform(void) +{ + init_hypervisor(&boot_cpu_data); + if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) + vmware_platform_setup(); +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index bc24f514ec9..0a46b4df5d8 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,6 +24,7 @@ #include #include #include +#include #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -47,21 +48,29 @@ static inline int __vmware_platform(void) return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; } -static unsigned long __vmware_get_tsc_khz(void) +static unsigned long vmware_get_tsc_khz(void) { uint64_t tsc_hz; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); - if (ebx == UINT_MAX) - return 0; tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); return tsc_hz; } +void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; +} + /* * While checking the dmi string infomation, just checking the product * serial key should be enough, as this will always have a VMware @@ -87,12 +96,6 @@ int 
vmware_platform(void) return 0; } -unsigned long vmware_get_tsc_khz(void) -{ - BUG_ON(!vmware_platform()); - return __vmware_get_tsc_khz(); -} - /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. * Still, due to timing difference when running on virtual cpus, the TSC can diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 64e9b5f59d2..75a21b61b86 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -187,7 +187,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; - pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC x86_cpuinit.setup_percpu_clockev = kvm_setup_secondary_clock; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9c0e644a76d..7cbf898d839 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -309,7 +309,6 @@ struct pv_time_ops pv_time_ops = { .get_wallclock = native_get_wallclock, .set_wallclock = native_set_wallclock, .sched_clock = native_sched_clock, - .get_tsc_khz = native_calibrate_tsc, }; struct pv_irq_ops pv_irq_ops = { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bb207a47c63..2d93026af7c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -818,7 +818,7 @@ void __init setup_arch(char **cmdline_p) * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. 
*/ - init_hypervisor(&boot_cpu_data); + init_hypervisor_platform(); x86_init.resources.probe_roms(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 97a0bcbad10..9917632a8b4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -18,6 +18,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -401,15 +402,9 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - hv_tsc_khz = get_hypervisor_tsc_freq(); - if (hv_tsc_khz) { - printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return hv_tsc_khz; - } - local_irq_save(flags); fast_calibrate = quick_pit_calibrate(); local_irq_restore(flags); @@ -567,7 +562,7 @@ int recalibrate_cpu_khz(void) unsigned long cpu_khz_old = cpu_khz; if (cpu_has_tsc) { - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, @@ -917,7 +912,7 @@ void __init tsc_init(void) if (!cpu_has_tsc) return; - tsc_khz = calibrate_tsc(); + tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; if (!tsc_khz) { diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index cd7d0fbbf66..052ae81ee08 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -825,7 +825,7 @@ static inline int __init activate_vmi(void) x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init; #endif pv_time_ops.sched_clock = vmi_sched_clock; - pv_time_ops.get_tsc_khz = vmi_tsc_khz; + x86_platform.calibrate_tsc = vmi_tsc_khz; /* We have true wallclock functions; disable CMOS clock sync */ no_sync_cmos_clock = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c 
index 2b3eb82efee..611b9e2360d 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void) return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE)); } -/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */ +/* x86_platform.calibrate_tsc = vmi_tsc_khz */ unsigned long vmi_tsc_khz(void) { unsigned long long khz; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 4790b92714a..13081b92191 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -13,6 +13,7 @@ #include #include #include +#include void __cpuinit x86_init_noop(void) { } void __init x86_init_uint_noop(unsigned int unused) { } @@ -67,3 +68,7 @@ struct __initdata x86_init_ops x86_init = { __cpuinitdata struct x86_cpuinit_ops x86_cpuinit = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, +}; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 6caa8c0c793..fabe745513d 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1320,11 +1320,11 @@ __init void lguest_init(void) /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; - pv_time_ops.get_tsc_khz = lguest_tsc_khz; x86_init.resources.memory_setup = lguest_memory_setup; x86_init.irqs.intr_init = lguest_init_IRQ; x86_init.timers.timer_init = lguest_time_init; + x86_platform.calibrate_tsc = lguest_tsc_khz; /* * Now is a good time to look at the implementations of these functions diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 84826b842b5..ee8cac77c8a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -844,7 +844,6 @@ static const struct pv_init_ops xen_init_ops __initdata = { static const struct pv_time_ops xen_time_ops __initdata = { .set_wallclock = xen_set_wallclock, .get_wallclock = xen_get_wallclock, - .get_tsc_khz = xen_tsc_khz, .sched_clock = 
xen_sched_clock, }; @@ -980,6 +979,8 @@ asmlinkage void __init xen_start_kernel(void) x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_cpuinit.setup_percpu_clockev = x86_init_noop; + x86_platform.calibrate_tsc = xen_tsc_khz; + #ifdef CONFIG_X86_64 /* * Setup percpu state. We only need to do this for 64-bit -- cgit v1.2.3-70-g09d2 From dd0a70c8f921708eba29ef9f30dde1f14a74af05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Aug 2009 16:51:07 +0200 Subject: x86: Move tsc_init to late_time_init We do not need the TSC before late_time_init. Move the tsc_init to the late time init code so we can also utilize HPET for calibration (which we claimed to do but never did except in some older kernel version). This also helps Moorestown to calibrate the TSC with the AHBT timer which needs to be initialized in late_time_init like HPET. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fda0c34da75..fcece00356a 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -108,6 +108,7 @@ void __init hpet_time_init(void) static void x86_late_time_init(void) { x86_init.timers.timer_init(); + tsc_init(); } /* @@ -116,6 +117,5 @@ static void x86_late_time_init(void) */ void __init time_init(void) { - tsc_init(); late_time_init = x86_late_time_init; } -- cgit v1.2.3-70-g09d2 From 47a3d5da70f411bc044ecd3c0593b158b09d0efa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 15:03:59 +0200 Subject: x86: Add early platform detection Platforms like Moorestown require early setup and want to avoid the call to reserve_ebda_region. The x86_init override is too late when the MRST detection happens in setup_arch. 
Move the default i386 x86_init overrides and the call to reserve_ebda_region into a separate function which is called as the default of a switch case depending on the hardware_subarch id in boot params. This allows us to add a case for MRST and let MRST have its own early setup function. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/x86_init.h | 3 +-- arch/x86/kernel/head32.c | 22 +++++++++++++++++----- arch/x86/kernel/head64.c | 3 ++- arch/x86/kernel/x86_init.c | 1 - 4 files changed, 20 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 20df5187171..b6c89428137 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -2,6 +2,7 @@ #define _ASM_X86_PLATFORM_H #include +#include struct mpc_bus; struct mpc_cpu; @@ -34,14 +35,12 @@ struct x86_init_mpparse { * @probe_roms: probe BIOS roms * @reserve_resources: reserve the standard resources for the * platform - * @reserve_ebda_region: reserve the extended bios data area * @memory_setup: platform specific memory setup * */ struct x86_init_resources { void (*probe_roms)(void); void (*reserve_resources)(void); - void (*reserve_ebda_region)(void); char *(*memory_setup)(void); }; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index a21398fac4f..441c075e2b8 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -15,6 +15,17 @@ #include #include #include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initilize 32bit specific setup functions */ + x86_init.resources.probe_roms = probe_roms; + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +} void __init i386_start_kernel(void) { @@ -31,12 +42,13 @@ void __init i386_start_kernel(void) reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif - /* Initilize 32bit specific setup functions 
*/ - x86_init.resources.probe_roms = probe_roms; - x86_init.resources.reserve_resources = i386_reserve_resources; - x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - x86_init.resources.reserve_ebda_region(); + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + default: + i386_default_early_setup(); + break; + } /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cead8149c3d..0b06cd778fd 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -24,6 +24,7 @@ #include #include #include +#include static void __init zap_identity_mappings(void) { @@ -111,7 +112,7 @@ void __init x86_64_start_reservations(char *real_mode_data) } #endif - x86_init.resources.reserve_ebda_region(); + reserve_ebda_region(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 13081b92191..24be7f39789 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -28,7 +28,6 @@ struct __initdata x86_init_ops x86_init = { .resources = { .probe_roms = x86_init_noop, .reserve_resources = reserve_standard_io_resources, - .reserve_ebda_region = reserve_ebda_region, .memory_setup = default_machine_specific_memory_setup, }, -- cgit v1.2.3-70-g09d2 From 162bc7ab01a00eba1c5d614e64a51e1268ee3f96 Mon Sep 17 00:00:00 2001 From: "Pan, Jacob jun" Date: Fri, 28 Aug 2009 14:52:47 -0700 Subject: x86: Add hardware_subarch ID for Moorestown x86 bootprotocol 2.07 has introduced hardware_subarch ID in the boot parameters provided by FW. We use it to identify Moorestown platforms. 
[ tglx: Cleanup and paravirt fix ] Signed-off-by: Jacob Pan Signed-off-by: Thomas Gleixner --- Documentation/x86/boot.txt | 1 + arch/x86/include/asm/bootparam.h | 10 ++++++++++ arch/x86/kernel/head_32.S | 1 + 3 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index 8da3a795083..30b43e1b269 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -599,6 +599,7 @@ Protocol: 2.07+ 0x00000000 The default x86/PC environment 0x00000001 lguest 0x00000002 Xen + 0x00000003 Moorestown MID Field name: hardware_subarch_data Type: write (subarch-dependent) diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317..283a9a1b3ef 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -109,4 +109,14 @@ struct boot_params { __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); +enum { + X86_SUBARCH_PC = 0, + X86_SUBARCH_LGUEST, + X86_SUBARCH_XEN, + X86_SUBARCH_MRST, + X86_NR_SUBARCHS, +}; + + + #endif /* _ASM_X86_BOOTPARAM_H */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index cc827ac9e8d..304e3f3d747 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -157,6 +157,7 @@ subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #endif /* CONFIG_PARAVIRT */ -- cgit v1.2.3-70-g09d2 From 3f4110a48a749a1aa1c54fb807afb3f32f49711c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 14:54:20 +0200 Subject: x86: Add Moorestown early detection Moorestown MID devices need to be detected early in the boot process to setup and do not call x86_default_early_setup as there is no EBDA region to reserve. 
[ Copied the minimal code from Jacob's latest MRST series ] Signed-off-by: Thomas Gleixner Cc: Jacob Pan --- arch/x86/Kconfig | 13 +++++++++++++ arch/x86/include/asm/setup.h | 6 ++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/head32.c | 3 +++ arch/x86/kernel/mrst.c | 24 ++++++++++++++++++++++++ 5 files changed, 47 insertions(+) create mode 100644 arch/x86/kernel/mrst.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5df37d..586d84557f7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -318,6 +318,7 @@ config X86_EXTENDED_PLATFORM SGI 320/540 (Visual Workstation) Summit/EXA (IBM x440) Unisys ES7000 IA32 series + Moorestown MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -377,6 +378,18 @@ config X86_ELAN If unsure, choose "PC-compatible" instead. +config X86_MRST + bool "Moorestown MID platform" + depends on X86_32 + depends on X86_EXTENDED_PLATFORM + ---help--- + Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin + Internet Device(MID) platform. Moorestown consists of two chips: + Lincroft (CPU core, graphics, and memory controller) and Langwell IOH. + Unlike standard x86 PCs, Moorestown does not have many legacy devices + nor standard legacy replacement devices/features. e.g. Moorestown does + not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+ config X86_RDC321X bool "RDC R-321x SoC" depends on X86_32 diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 861e1fe2303..18e496c98ff 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -49,6 +49,12 @@ extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern void setup_default_timer_irq(void); +#ifdef CONFIG_X86_MRST +extern void x86_mrst_early_setup(void); +#else +static inline void x86_mrst_early_setup(void) { } +#endif + #ifndef _SETUP /* diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ccf3db607c2..5f33316610d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 441c075e2b8..4f8e2507e8f 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -45,6 +45,9 @@ void __init i386_start_kernel(void) /* Call the subarch specific early setup function */ switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; default: i386_default_early_setup(); break; diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c new file mode 100644 index 00000000000..3b7078abc87 --- /dev/null +++ b/arch/x86/kernel/mrst.c @@ -0,0 +1,24 @@ +/* + * mrst.c: Intel Moorestown platform specific setup code + * + * (C) Copyright 2008 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ +#include + +#include + +/* + * Moorestown specific x86_init function overrides and early setup + * calls. + */ +void __init x86_mrst_early_setup(void) +{ + x86_init.resources.probe_roms = x86_init_noop; + x86_init.resources.reserve_resources = x86_init_noop; +} -- cgit v1.2.3-70-g09d2 From bc07844a33734c4b2f32ef26d942d2f3ef9302ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 18:09:57 +0200 Subject: x86: Distangle ioapic and i8259 The proposed Moorestown support patches use an extra feature flag mechanism to make the ioapic work w/o an i8259. There is a much simpler solution. Most i8259 specific functions are already called depending on whether the irq number is less than NR_IRQS_LEGACY. Replacing that constant by a read_mostly variable which can be set to 0 by the platform setup code allows us to achieve the same without any special feature flags. That trivial change allows us to proceed with MRST w/o doing a full blown overhaul of the ioapic code which would delay MRST unduly. Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/io_apic.h | 2 ++ arch/x86/kernel/apic/io_apic.c | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 2b8aeb89933..e1f89a1a07e 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,6 +143,8 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; +extern void io_apic_disable_legacy(void); + /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's.
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5f4687187ce..6c961290a5f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -91,6 +91,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +/* Number of legacy interrupts */ +static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; + #if defined (CONFIG_MCA) || defined (CONFIG_EISA) int mp_bus_id_to_type[MAX_MP_BUSSES]; #endif @@ -172,6 +177,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; +void __init io_apic_disable_legacy(void) +{ + nr_legacy_irqs = 0; + nr_irqs_gsi = 0; +} + int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -189,7 +200,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < NR_IRQS_LEGACY) + if (i < nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -883,7 +894,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1480,7 +1491,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) disable_8259A_irq(irq); ioapic_write_entry(apic_id, pin, entry); @@ -1851,7 +1862,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET) + if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1914,6 +1925,10 @@ void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); 
nr_ioapic_registers[apic] = reg_01.bits.entries+1; } + + if (!nr_legacy_irqs) + return; + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ @@ -1968,6 +1983,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); + if (!nr_legacy_irqs) + return; + /* * If the i8259 is routed through an IOAPIC * Put that IOAPIC in virtual wire mode @@ -2198,7 +2216,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < NR_IRQS_LEGACY) { + if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) was_pending = 1; @@ -2709,7 +2727,7 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < NR_IRQS_LEGACY) + if (irq < nr_legacy_irqs) make_8259A_irq(irq); else /* Strange. Oh, well.. */ @@ -3045,7 +3063,7 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) +#define PIC_IRQS (1UL << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { @@ -3053,8 +3071,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - - io_apic_irqs = ~PIC_IRQS; + io_apic_irqs = nr_legacy_irqs ? 
~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3065,7 +3082,8 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - check_timer(); + if (nr_legacy_irqs) + check_timer(); } /* @@ -3166,7 +3184,6 @@ static int __init ioapic_init_sysfs(void) device_initcall(ioapic_init_sysfs); -static int nr_irqs_gsi = NR_IRQS_LEGACY; /* * Dynamic irq allocate and deallocation */ @@ -3907,7 +3924,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= NR_IRQS_LEGACY) { + if (irq >= nr_legacy_irqs) { cfg = desc->chip_data; add_pin_to_irq_node(cfg, node, ioapic, pin); } -- cgit v1.2.3-70-g09d2 From e11dadabf443dc3101f28b74d8b9d56870a87db4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Aug 2009 15:18:40 +0200 Subject: x86: apic namespace cleanup boot_cpu_physical_apicid is a global variable and used as function argument as well. Rename the function arguments to avoid confusion. 
Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/apic.h | 14 +++++++------- arch/x86/kernel/apic/bigsmp_32.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/apic/summit_32.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 6f15b29005a..d6a0f2636a6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -292,7 +292,7 @@ struct apic { int (*cpu_present_to_apicid)(int mps_cpu); physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); void (*setup_portio_remap)(void); - int (*check_phys_apicid_present)(int boot_cpu_physical_apicid); + int (*check_phys_apicid_present)(int phys_apicid); void (*enable_apic_mode)(void); int (*phys_pkg_id)(int cpuid_apic, int index_msb); @@ -426,7 +426,7 @@ extern struct apic apic_x2apic_uv_x; DECLARE_PER_CPU(int, x2apic_extra_bits); extern int default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); +extern int default_check_phys_apicid_present(int phys_apicid); #endif static inline void default_wait_for_init_deassert(atomic_t *deassert) @@ -542,9 +542,9 @@ static inline int __default_cpu_present_to_apicid(int mps_cpu) } static inline int -__default_check_phys_apicid_present(int boot_cpu_physical_apicid) +__default_check_phys_apicid_present(int phys_apicid) { - return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map); + return physid_isset(phys_apicid, phys_cpu_present_map); } #ifdef CONFIG_X86_32 @@ -554,13 +554,13 @@ static inline int default_cpu_present_to_apicid(int mps_cpu) } static inline int -default_check_phys_apicid_present(int boot_cpu_physical_apicid) +default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #else extern int 
default_cpu_present_to_apicid(int mps_cpu); -extern int default_check_phys_apicid_present(int boot_cpu_physical_apicid); +extern int default_check_phys_apicid_present(int phys_apicid); #endif static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid) diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 676cdac385c..77a06413b6b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) return physids_promote(0xFFL); } -static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int bigsmp_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f1ebed6bd15..efa00e2b850 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -413,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; -static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid) +static inline int numaq_check_phys_apicid_present(int phys_apicid) { return 1; } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index eafdfbd1ea9..645ecc4ff0b 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid) return physid_mask_of_physid(0); } -static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) +static int summit_check_phys_apicid_present(int physical_apicid) { return 1; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2d93026af7c..fda22ec1a93 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -129,9 +129,9 @@ int default_cpu_present_to_apicid(int mps_cpu) return 
__default_cpu_present_to_apicid(mps_cpu); } -int default_check_phys_apicid_present(int boot_cpu_physical_apicid) +int default_check_phys_apicid_present(int phys_apicid) { - return __default_check_phys_apicid_present(boot_cpu_physical_apicid); + return __default_check_phys_apicid_present(phys_apicid); } #endif -- cgit v1.2.3-70-g09d2 From 132ec92f3f70fe365c1f4b8d46e66cf8a2a16880 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 31 Aug 2009 09:50:09 +0200 Subject: x86, msr: Add rd/wrmsr interfaces with preset registers native_{rdmsr,wrmsr}_safe_regs are two new interfaces which allow presetting of a subset of eight x86 GPRs before executing the rd/wrmsr instructions. This is needed at least on AMD K8 for accessing an erratum workaround MSR. Originally based on an idea by H. Peter Anvin. Signed-off-by: Borislav Petkov LKML-Reference: <1251705011-18636-1-git-send-email-petkovbb@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr.h | 13 ++++++ arch/x86/include/asm/paravirt.h | 16 +++++++ arch/x86/kernel/paravirt.c | 2 + arch/x86/lib/Makefile | 1 + arch/x86/lib/msr-reg.S | 98 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+) create mode 100644 arch/x86/lib/msr-reg.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 48ad9d29484..184d4a11396 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -113,6 +113,9 @@ notrace static inline int native_write_msr_safe(unsigned int msr, extern unsigned long long native_read_tsc(void); +extern int native_rdmsr_safe_regs(u32 *regs); +extern int native_wrmsr_safe_regs(u32 *regs); + static __always_inline unsigned long long __native_read_tsc(void) { DECLARE_ARGS(val, low, high); @@ -189,6 +192,16 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) return err; } +static inline int rdmsr_safe_regs(u32 *regs) +{ + return native_rdmsr_safe_regs(regs); +} + +static inline int wrmsr_safe_regs(u32 
*regs) +{ + return native_wrmsr_safe_regs(regs); +} + #define rdtscl(low) \ ((low) = (u32)__native_read_tsc()) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4fb37c8a083..1705944e037 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -168,7 +168,9 @@ struct pv_cpu_ops { err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ u64 (*read_msr_amd)(unsigned int msr, int *err); u64 (*read_msr)(unsigned int msr, int *err); + int (*rdmsr_regs)(u32 *regs); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); + int (*wrmsr_regs)(u32 *regs); u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); @@ -820,6 +822,12 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err) { return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); } + +static inline int paravirt_rdmsr_regs(u32 *regs) +{ + return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); +} + static inline u64 paravirt_read_msr_amd(unsigned msr, int *err) { return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err); @@ -829,6 +837,11 @@ static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); } +static inline int paravirt_wrmsr_regs(u32 *regs) +{ + return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); +} + /* These should all do BUG_ON(_err), but our headers are too tangled. 
*/ #define rdmsr(msr, val1, val2) \ do { \ @@ -862,6 +875,9 @@ do { \ _err; \ }) +#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs) +#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs) + static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 70ec9b951d7..67594af43b3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -362,8 +362,10 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, + .rdmsr_regs = native_rdmsr_safe_regs, .read_msr_amd = native_read_msr_amd_safe, .write_msr = native_write_msr_safe, + .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 07c31899c9c..b59c0647d80 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -8,6 +8,7 @@ lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += msr-reg.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S new file mode 100644 index 00000000000..51f1bb3f8c7 --- /dev/null +++ b/arch/x86/lib/msr-reg.S @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +/* + * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * + * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] + * + */ +.macro op_safe_regs op:req +ENTRY(native_\op\()_safe_regs) + push %rbx + push %rbp + push $0 /* Return value */ + push %rdi + movl (%rdi), %eax + movl 4(%rdi), %ecx + movl 8(%rdi), %edx + movl 12(%rdi), %ebx + movl 20(%rdi), %ebp + movl 24(%rdi), %esi + movl 28(%rdi), %edi +1: \op +2: movl %edi, %r10d + pop %rdi + movl %eax, (%rdi) + movl %ecx, 4(%rdi) + movl %edx, 8(%rdi) + movl %ebx, 12(%rdi) + movl %ebp, 20(%rdi) + movl %esi, 24(%rdi) + movl %r10d, 
28(%rdi) + pop %rax + pop %rbp + pop %rbx + ret +3: + movq $-EIO, 8(%rsp) + jmp 2b + .section __ex_table,"ax" + .balign 4 + .quad 1b, 3b + .previous +ENDPROC(native_\op\()_safe_regs) +.endm + +#else /* X86_32 */ + +.macro op_safe_regs op:req +ENTRY(native_\op\()_safe_regs) + push %ebx + push %ebp + push %esi + push %edi + push $0 /* Return value */ + push %eax + movl 4(%eax), %ecx + movl 8(%eax), %edx + movl 12(%eax), %ebx + movl 20(%eax), %ebp + movl 24(%eax), %esi + movl 28(%eax), %edi + movl (%eax), %eax +1: \op +2: push %eax + movl 4(%esp), %eax + pop (%eax) + addl $4, %esp + movl %ecx, 4(%eax) + movl %edx, 8(%eax) + movl %ebx, 12(%eax) + movl %ebp, 20(%eax) + movl %esi, 24(%eax) + movl %edi, 28(%eax) + pop %eax + pop %edi + pop %esi + pop %ebp + pop %ebx + ret +3: + movl $-EIO, 4(%esp) + jmp 2b + .section __ex_table,"ax" + .balign 4 + .long 1b, 3b + .previous +ENDPROC(native_\op\()_safe_regs) +.endm + +#endif + +op_safe_regs rdmsr +op_safe_regs wrmsr + -- cgit v1.2.3-70-g09d2 From 177fed1ee8d727c39601ce9fc2299b4cb25a718e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 31 Aug 2009 09:50:10 +0200 Subject: x86, msr: Rewrite AMD rd/wrmsr variants Switch them to native_{rd,wr}msr_safe_regs and remove pv_cpu_ops.read_msr_amd. Signed-off-by: Borislav Petkov LKML-Reference: <1251705011-18636-2-git-send-email-petkovbb@gmail.com> Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/msr.h | 38 +++++++++++++++++++++----------------- arch/x86/include/asm/paravirt.h | 26 ++++++++++++++++++++------ arch/x86/kernel/paravirt.c | 1 - 3 files changed, 41 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 184d4a11396..09c5ca70d49 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -71,22 +71,6 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, return EAX_EDX_VAL(val, low, high); } -static inline unsigned long long native_read_msr_amd_safe(unsigned int msr, - int *err) -{ - DECLARE_ARGS(val, low, high); - - asm volatile("2: rdmsr ; xor %0,%0\n" - "1:\n\t" - ".section .fixup,\"ax\"\n\t" - "3: mov %3,%0 ; jmp 1b\n\t" - ".previous\n\t" - _ASM_EXTABLE(2b, 3b) - : "=r" (*err), EAX_EDX_RET(val, low, high) - : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT)); - return EAX_EDX_VAL(val, low, high); -} - static inline void native_write_msr(unsigned int msr, unsigned low, unsigned high) { @@ -184,14 +168,34 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) *p = native_read_msr_safe(msr, &err); return err; } + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { + u32 gprs[8] = { 0 }; int err; - *p = native_read_msr_amd_safe(msr, &err); + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = native_rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + return err; } +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return native_wrmsr_safe_regs(gprs); +} + static inline int rdmsr_safe_regs(u32 *regs) { return native_rdmsr_safe_regs(regs); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 1705944e037..11574934a99 100644 --- a/arch/x86/include/asm/paravirt.h +++ 
b/arch/x86/include/asm/paravirt.h @@ -166,7 +166,6 @@ struct pv_cpu_ops { /* MSR, PMC and TSR operations. err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ - u64 (*read_msr_amd)(unsigned int msr, int *err); u64 (*read_msr)(unsigned int msr, int *err); int (*rdmsr_regs)(u32 *regs); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); @@ -828,10 +827,6 @@ static inline int paravirt_rdmsr_regs(u32 *regs) return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); } -static inline u64 paravirt_read_msr_amd(unsigned msr, int *err) -{ - return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err); -} static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); @@ -887,12 +882,31 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) } static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) { + u32 gprs[8] = { 0 }; int err; - *p = paravirt_read_msr_amd(msr, &err); + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = paravirt_rdmsr_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + return err; } +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return paravirt_wrmsr_regs(gprs); +} + static inline u64 paravirt_read_tsc(void) { return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 67594af43b3..f5b0b4a01fb 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -363,7 +363,6 @@ struct pv_cpu_ops pv_cpu_ops = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .rdmsr_regs = native_rdmsr_safe_regs, - .read_msr_amd = native_read_msr_amd_safe, .write_msr = native_write_msr_safe, .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, -- cgit v1.2.3-70-g09d2 From 6b0f43ddfa358dc71ad2a2d57bce5906c1c5dc1a Mon Sep 17 00:00:00 2001 
From: Borislav Petkov Date: Mon, 31 Aug 2009 09:50:11 +0200 Subject: x86, AMD: Disable wrongly set X86_FEATURE_LAHF_LM CPUID bit fbd8b1819e80ac5a176d085fdddc3a34d1499318 turns off the bit for /proc/cpuinfo. However, a proper/full fix would be to additionally turn off the bit in the CPUID output so that future callers get correct CPU features info. Do that by basically reversing what the BIOS wrongfully does at boot. Signed-off-by: Borislav Petkov LKML-Reference: <1251705011-18636-3-git-send-email-petkovbb@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 63fddcd082c..0a717fc6aeb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -404,9 +404,18 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* * Some BIOSes incorrectly force this feature, but only K8 * revision D (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14) + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + u64 val; + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &val)) { + val &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, val); + } + } + } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); -- cgit v1.2.3-70-g09d2 From fe9b4e4e40ffdabbd385cdf171cb861c2fd517c0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 31 Aug 2009 11:53:23 -0700 Subject: x86, asm: Add 32-bit versions of the combined CFI macros Add 32-bit versions of the combined CFI macros, equivalent to the 64-bit ones except, obviously, operating on 32-bit stack words. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/dwarf2.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 3afc5e87cfd..ae6253ab902 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -87,9 +87,25 @@ CFI_RESTORE \reg .endm #else /*!CONFIG_X86_64*/ + .macro pushl_cfi reg + pushl \reg + CFI_ADJUST_CFA_OFFSET 4 + .endm - /* 32bit defenitions are missed yet */ + .macro popl_cfi reg + popl \reg + CFI_ADJUST_CFA_OFFSET -4 + .endm + .macro movl_cfi reg offset=0 + movl %\reg, \offset(%esp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movl_cfi_restore offset reg + movl \offset(%esp), %\reg + CFI_RESTORE \reg + .endm #endif /*!CONFIG_X86_64*/ #endif /*__ASSEMBLY__*/ -- cgit v1.2.3-70-g09d2 From 709972b1f6f70535d1fddbe1243a51b90c408a1c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 31 Aug 2009 11:57:20 -0700 Subject: x86, asm: Make _ASM_EXTABLE() usable from assembly code We have had this convenient macro _ASM_EXTABLE() to generate exception table entry in inline assembly. Make it also usable for pure assembly. Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/asm.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 56be78f582f..b3ed1e1460f 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -3,7 +3,7 @@ #ifdef __ASSEMBLY__ # define __ASM_FORM(x) x -# define __ASM_EX_SEC .section __ex_table +# define __ASM_EX_SEC .section __ex_table, "a" #else # define __ASM_FORM(x) " " #x " " # define __ASM_EX_SEC " .section __ex_table,\"a\"\n" @@ -38,10 +38,18 @@ #define _ASM_DI __ASM_REG(di) /* Exception table entry */ +#ifdef __ASSEMBLY__ +# define _ASM_EXTABLE(from,to) \ + __ASM_EX_SEC ; \ + _ASM_ALIGN ; \ + _ASM_PTR from , to ; \ + .previous +#else # define _ASM_EXTABLE(from,to) \ __ASM_EX_SEC \ _ASM_ALIGN "\n" \ _ASM_PTR #from "," #to "\n" \ " .previous\n" +#endif #endif /* _ASM_X86_ASM_H */ -- cgit v1.2.3-70-g09d2 From 79c5dca3619d6ae15815eec14cd7a43db5f38b47 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 31 Aug 2009 13:59:53 -0700 Subject: x86, msr: CFI annotations, cleanups for msr-reg.S Add CFI annotations for native_{rd,wr}msr_safe_regs(). Simplify the 64-bit implementation: we don't allow the upper half registers to be set, and so we can use them to carry state across the operation. Signed-off-by: H. 
Peter Anvin Cc: Borislav Petkov LKML-Reference: <1251705011-18636-1-git-send-email-petkovbb@gmail.com> --- arch/x86/lib/msr-reg.S | 80 ++++++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 51f1bb3f8c7..9e8cdcf5d73 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -12,10 +13,11 @@ */ .macro op_safe_regs op:req ENTRY(native_\op\()_safe_regs) - push %rbx - push %rbp - push $0 /* Return value */ - push %rdi + CFI_STARTPROC + pushq_cfi %rbx + pushq_cfi %rbp + movq %rdi, %r10 /* Save pointer */ + xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax movl 4(%rdi), %ecx movl 8(%rdi), %edx @@ -23,27 +25,26 @@ ENTRY(native_\op\()_safe_regs) movl 20(%rdi), %ebp movl 24(%rdi), %esi movl 28(%rdi), %edi + CFI_REMEMBER_STATE 1: \op -2: movl %edi, %r10d - pop %rdi - movl %eax, (%rdi) - movl %ecx, 4(%rdi) - movl %edx, 8(%rdi) - movl %ebx, 12(%rdi) - movl %ebp, 20(%rdi) - movl %esi, 24(%rdi) - movl %r10d, 28(%rdi) - pop %rax - pop %rbp - pop %rbx +2: movl %eax, (%r10) + movl %r11d, %eax /* Return value */ + movl %ecx, 4(%r10) + movl %edx, 8(%r10) + movl %ebx, 12(%r10) + movl %ebp, 20(%r10) + movl %esi, 24(%r10) + movl %edi, 28(%r10) + popq_cfi %rbp + popq_cfi %rbx ret 3: - movq $-EIO, 8(%rsp) + CFI_RESTORE_STATE + movl $-EIO, %r11d jmp 2b - .section __ex_table,"ax" - .balign 4 - .quad 1b, 3b - .previous + + _ASM_EXTABLE(1b, 3b) + CFI_ENDPROC ENDPROC(native_\op\()_safe_regs) .endm @@ -51,12 +52,13 @@ ENDPROC(native_\op\()_safe_regs) .macro op_safe_regs op:req ENTRY(native_\op\()_safe_regs) - push %ebx - push %ebp - push %esi - push %edi - push $0 /* Return value */ - push %eax + CFI_STARTPROC + pushl_cfi %ebx + pushl_cfi %ebp + pushl_cfi %esi + pushl_cfi %edi + pushl_cfi $0 /* Return value */ + pushl_cfi %eax movl 4(%eax), %ecx movl 8(%eax), %edx movl 12(%eax), %ebx @@ 
-64,30 +66,32 @@ ENTRY(native_\op\()_safe_regs) movl 24(%eax), %esi movl 28(%eax), %edi movl (%eax), %eax + CFI_REMEMBER_STATE 1: \op -2: push %eax +2: pushl_cfi %eax movl 4(%esp), %eax - pop (%eax) + popl_cfi (%eax) addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 movl %ecx, 4(%eax) movl %edx, 8(%eax) movl %ebx, 12(%eax) movl %ebp, 20(%eax) movl %esi, 24(%eax) movl %edi, 28(%eax) - pop %eax - pop %edi - pop %esi - pop %ebp - pop %ebx + popl_cfi %eax + popl_cfi %edi + popl_cfi %esi + popl_cfi %ebp + popl_cfi %ebx ret 3: + CFI_RESTORE_STATE movl $-EIO, 4(%esp) jmp 2b - .section __ex_table,"ax" - .balign 4 - .long 1b, 3b - .previous + + _ASM_EXTABLE(1b, 3b) + CFI_ENDPROC ENDPROC(native_\op\()_safe_regs) .endm -- cgit v1.2.3-70-g09d2 From 0cc0213e73af5963eca259c84876937c20689dbd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 31 Aug 2009 14:23:29 -0700 Subject: x86, msr: Have the _safe MSR functions return -EIO, not -EFAULT For some reason, the _safe MSR functions returned -EFAULT, not -EIO. However, the only user which cares about the return code as anything other than a boolean is the MSR driver, which wants -EIO. Change it to -EIO across the board. Signed-off-by: H. 
Peter Anvin Cc: Jeremy Fitzhardinge Cc: Chris Wright Cc: Alok Kataria Cc: Rusty Russell --- arch/x86/include/asm/msr.h | 4 ++-- arch/x86/kernel/msr.c | 10 ++-------- arch/x86/xen/enlighten.c | 2 +- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 09c5ca70d49..943fdd572e1 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -67,7 +67,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, ".previous\n\t" _ASM_EXTABLE(2b, 3b) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) - : "c" (msr), [fault] "i" (-EFAULT)); + : "c" (msr), [fault] "i" (-EIO)); return EAX_EDX_VAL(val, low, high); } @@ -90,7 +90,7 @@ notrace static inline int native_write_msr_safe(unsigned int msr, _ASM_EXTABLE(2b, 3b) : [err] "=a" (err) : "c" (msr), "0" (low), "d" (high), - [fault] "i" (-EFAULT) + [fault] "i" (-EIO) : "memory"); return err; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 98fd6cd4e3a..2cfbb4b2c42 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -80,11 +80,8 @@ static ssize_t msr_read(struct file *file, char __user *buf, for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } if (copy_to_user(tmp, &data, 8)) { err = -EFAULT; break; @@ -115,11 +112,8 @@ static ssize_t msr_write(struct file *file, const char __user *buf, break; } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) { - if (err == -EFAULT) /* Fix idiotic error code */ - err = -EIO; + if (err) break; - } tmp += 2; bytes += 8; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0a1700a2be9..a8432d81690 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -713,7 +713,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) set: base = ((u64)high 
<< 32) | low; if (HYPERVISOR_set_segment_base(which, base) != 0) - ret = -EFAULT; + ret = -EIO; break; #endif -- cgit v1.2.3-70-g09d2 From 8b956bf1f0f2b552ed93cf6cafe823edff298b3b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 31 Aug 2009 14:13:48 -0700 Subject: x86, msr: Create _on_cpu helpers for {rw,wr}msr_safe_regs() Create _on_cpu helpers for {rw,wr}msr_safe_regs() analogously with the other MSR functions. This will be necessary to add support for these to the MSR driver. Signed-off-by: H. Peter Anvin Cc: Borislav Petkov --- arch/x86/include/asm/msr.h | 18 +++++++++++++---- arch/x86/lib/msr.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 943fdd572e1..8e56712aa17 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -97,8 +97,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr, extern unsigned long long native_read_tsc(void); -extern int native_rdmsr_safe_regs(u32 *regs); -extern int native_wrmsr_safe_regs(u32 *regs); +extern int native_rdmsr_safe_regs(u32 regs[8]); +extern int native_wrmsr_safe_regs(u32 regs[8]); static __always_inline unsigned long long __native_read_tsc(void) { @@ -196,12 +196,12 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) return native_wrmsr_safe_regs(gprs); } -static inline int rdmsr_safe_regs(u32 *regs) +static inline int rdmsr_safe_regs(u32 regs[8]) { return native_rdmsr_safe_regs(regs); } -static inline int wrmsr_safe_regs(u32 *regs) +static inline int wrmsr_safe_regs(u32 regs[8]) { return native_wrmsr_safe_regs(regs); } @@ -245,6 +245,8 @@ void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, 
u32 l, u32 h); +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { @@ -275,6 +277,14 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } +static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) +{ + return rdmsr_safe_regs(regs); +} +static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) +{ + return wrmsr_safe_regs(regs); +} #endif /* CONFIG_SMP */ #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index caa24aca811..33a1e3ca22d 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -175,3 +175,52 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_on_cpu); + +/* + * These variants are significantly slower, but allows control over + * the entire 32-bit GPR set. + */ +struct msr_regs_info { + u32 *regs; + int err; +}; + +static void __rdmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = rdmsr_safe_regs(rv->regs); +} + +static void __wrmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = wrmsr_safe_regs(rv->regs); +} + +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); + +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); + + return err ? 
err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); -- cgit v1.2.3-70-g09d2 From ff55df53dfdd338906c8ba9d1f4a759b86b869d5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 31 Aug 2009 14:16:57 -0700 Subject: x86, msr: Export the register-setting MSR functions via /dev/*/msr Make it possible to access the all-register-setting/getting MSR functions via the MSR driver. This is implemented as an ioctl() on the standard MSR device node. Signed-off-by: H. Peter Anvin Cc: Borislav Petkov --- Documentation/ioctl/ioctl-number.txt | 1 + arch/x86/include/asm/msr.h | 10 +++++-- arch/x86/kernel/msr.c | 51 ++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index dbea4f95fc8..1c058b552e9 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -121,6 +121,7 @@ Code Seq# Include File Comments 'c' 00-7F linux/comstats.h conflict! 'c' 00-7F linux/coda.h conflict! 'c' 80-9F arch/s390/include/asm/chsc.h +'c' A0-AF arch/x86/include/asm/msr.h 'd' 00-FF linux/char/drm/drm/h conflict! 'd' F0-FF linux/digi1.h 'e' all linux/digi1.h conflict! 
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 8e56712aa17..7e2b6ba962f 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -3,10 +3,16 @@ #include -#ifdef __KERNEL__ #ifndef __ASSEMBLY__ #include +#include + +#define X86_IOC_RDMSR_REGS _IOWR('c', 0xA0, __u32[8]) +#define X86_IOC_WRMSR_REGS _IOWR('c', 0xA1, __u32[8]) + +#ifdef __KERNEL__ + #include #include #include @@ -286,6 +292,6 @@ static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ -#endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_MSR_H */ diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 2cfbb4b2c42..7dd95009417 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -1,6 +1,7 @@ /* ----------------------------------------------------------------------- * * * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * Copyright 2009 Intel Corporation; author: H. Peter Anvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -121,6 +122,54 @@ static ssize_t msr_write(struct file *file, const char __user *buf, return bytes ? 
bytes : err; } +static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) +{ + u32 __user *uregs = (u32 __user *)arg; + u32 regs[8]; + int cpu = iminor(file->f_path.dentry->d_inode); + int err; + + switch (ioc) { + case X86_IOC_RDMSR_REGS: + if (!(file->f_mode & FMODE_READ)) { + err = -EBADF; + break; + } + if (copy_from_user(&regs, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = rdmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, &regs, sizeof regs)) + err = -EFAULT; + break; + + case X86_IOC_WRMSR_REGS: + if (!(file->f_mode & FMODE_WRITE)) { + err = -EBADF; + break; + } + if (copy_from_user(&regs, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, &regs, sizeof regs)) + err = -EFAULT; + break; + + default: + err = -ENOTTY; + break; + } + + return err; +} + static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); @@ -151,6 +200,8 @@ static const struct file_operations msr_fops = { .read = msr_read, .write = msr_write, .open = msr_open, + .unlocked_ioctl = msr_ioctl, + .compat_ioctl = msr_ioctl, }; static int __cpuinit msr_device_create(int cpu) -- cgit v1.2.3-70-g09d2 From acde31dc467797ccae3a55b791a77af446cce018 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 27 Aug 2009 14:29:20 +0100 Subject: kmemleak: Ignore the aperture memory hole on x86_64 This block is allocated with alloc_bootmem() and scanned by kmemleak but the kernel direct mapping may no longer exist. This patch tells kmemleak to ignore this memory hole. The dma32_bootmem_ptr in dma32_reserve_bootmem() is also ignored. 
Signed-off-by: Catalin Marinas Acked-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 6 ++++++ arch/x86/kernel/pci-dma.c | 6 ++++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 676debfc170..128111d8ffe 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +95,11 @@ static u32 __init allocate_aperture(void) * code for safe */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(p); if (!p || __pa(p)+aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%p,%uK)\n", diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506..fa80f60e960 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,11 @@ void __init dma32_reserve_bootmem(void) size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); + /* + * Kmemleak should not scan this block as it may not be mapped via the + * kernel direct mapping. + */ + kmemleak_ignore(dma32_bootmem_ptr); if (dma32_bootmem_ptr) dma32_bootmem_size = size; else -- cgit v1.2.3-70-g09d2 From db39d5529d347de5e2eec1a72d67fcfacae6c5a2 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Fri, 21 Aug 2009 19:15:28 -0500 Subject: [CPUFREQ] Powernow-k8: Enable more than 2 low P-states Remove an obsolete check that used to prevent there being more than 2 low P-states. Now that low-to-low P-states changes are enabled, it prevents otherwise workable configurations with multiple low P-states. 
Signed-off-by: Mark Langsdorf Tested-by: Krists Krilovs Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2a50ef89100..0cbce0481a5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -854,6 +854,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) goto err_out; } + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + if (cpu_family == CPU_HW_PSTATE) ret_val = fill_powernow_table_pstate(data, powernow_table); else @@ -866,11 +870,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; - /* fill in data */ - data->numps = data->acpi_data.state_count; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); - powernow_k8_acpi_pst_values(data, 0); /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); @@ -941,7 +942,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table) { int i; - int cntlofreq = 0; for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; @@ -982,27 +982,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, continue; } - /* verify only 1 entry from the lo frequency table */ - if (fid < HI_FID_TABLE_BOTTOM) { - if (cntlofreq) { - /* if both entries are the same, - * ignore this one ... 
*/ - if ((freq != powernow_table[cntlofreq].frequency) || - (index != powernow_table[cntlofreq].index)) { - printk(KERN_ERR PFX - "Too many lo freq table " - "entries\n"); - return 1; - } - - dprintk("double low frequency table entry, " - "ignoring it.\n"); - invalidate_entry(data, i); - continue; - } else - cntlofreq = i; - } - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries " "%u kHz vs. %u kHz\n", freq, -- cgit v1.2.3-70-g09d2 From 1a8e42fa81e62d47cc471f7764f906bb42b27a54 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 26 Aug 2009 13:19:37 -0400 Subject: [CPUFREQ] Create a blacklist for processors that should not load the acpi-cpufreq module. Create a blacklist for processors that should not load the acpi-cpufreq module. The initial entry in the blacklist function is the Intel 0f68 processor. Its specification update mentions errata AL30 which implies that cpufreq should not run on this processor. Signed-off-by: Prarit Bhargava Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ae9b503220c..badce508406 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -588,6 +588,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { }, { } }; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause + * Both Processor Cores to Lock Up when HT is enabled*/ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8) && smt_capable()) + return -ENODEV; + } + return 0; +} #endif static int
acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) @@ -602,6 +617,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) dprintk("acpi_cpufreq_cpu_init\n"); +#ifdef CONFIG_SMP + result = acpi_cpufreq_blacklist(c); + if (result) + return result; +#endif + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); if (!data) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From f6909f394c2d4a0a71320797df72d54c49c5927e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 1 Sep 2009 13:31:52 -0700 Subject: x86, msr: fix msr-reg.S compilation with gas 2.16.1 msr-reg.S used the :req option on a macro argument, which wasn't supported by gas 2.16.1 (but apparently by some earlier versions of gas, just to be confusing.) It isn't necessary, so just remove it. Signed-off-by: H. Peter Anvin Cc: Borislav Petkov --- arch/x86/lib/msr-reg.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 9e8cdcf5d73..d5eaf53aa67 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -11,7 +11,7 @@ * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ -.macro op_safe_regs op:req +.macro op_safe_regs op ENTRY(native_\op\()_safe_regs) CFI_STARTPROC pushq_cfi %rbx -- cgit v1.2.3-70-g09d2 From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of intel_txt patch on non-X86 platforms. Remove the tboot code from generic code init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 4 + arch/x86/include/asm/tboot.h | 197 ------------------------------------------ arch/x86/kernel/reboot.c | 3 +- arch/x86/kernel/setup.c | 3 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tboot.c | 58 ++++++++++--- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/pci/dmar.c | 2 +- drivers/pci/intel-iommu.c | 2 +- include/linux/tboot.h | 162 ++++++++++++++++++++++++++++++++++ init/main.c | 3 - kernel/cpu.c | 6 +- security/Kconfig | 2 +- 13 files changed, 221 insertions(+), 225 deletions(-) delete mode 100644 arch/x86/include/asm/tboot.h create mode 100644 include/linux/tboot.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8..b66f2102c35 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h deleted file mode 100644 index b13929d4e5f..00000000000 --- a/arch/x86/include/asm/tboot.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * tboot.h: shared data structure with tboot and kernel and functions - * used by kernel for runtime support of Intel(R) Trusted - * Execution Technology - * - * Copyright (c) 2006-2009, Intel Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
- * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#ifndef _ASM_TBOOT_H -#define _ASM_TBOOT_H - -#include - -/* these must have the values from 0-5 in this order */ -enum { - TB_SHUTDOWN_REBOOT = 0, - TB_SHUTDOWN_S5, - TB_SHUTDOWN_S4, - TB_SHUTDOWN_S3, - TB_SHUTDOWN_HALT, - TB_SHUTDOWN_WFS -}; - -#ifdef CONFIG_INTEL_TXT - -/* used to communicate between tboot and the launched kernel */ - -#define TB_KEY_SIZE 64 /* 512 bits */ - -#define MAX_TB_MAC_REGIONS 32 - -struct tboot_mac_region { - u64 start; /* must be 64 byte -aligned */ - u32 size; /* must be 64 byte -granular */ -} __packed; - -/* GAS - Generic Address Structure (ACPI 2.0+) */ -struct tboot_acpi_generic_address { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_width; - u64 address; -} __packed; - -/* - * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec - * (http://www.acpi.info/) - */ -struct tboot_acpi_sleep_info { - struct tboot_acpi_generic_address pm1a_cnt_blk; - struct tboot_acpi_generic_address pm1b_cnt_blk; - struct tboot_acpi_generic_address pm1a_evt_blk; - struct tboot_acpi_generic_address pm1b_evt_blk; - u16 pm1a_cnt_val; - u16 pm1b_cnt_val; - u64 wakeup_vector; - u32 vector_width; - u64 kernel_s3_resume_vector; -} __packed; - -/* - * shared memory page used for communication between tboot and kernel - */ -struct tboot { - /* - * version 3+ fields: - */ - - /* TBOOT_UUID */ - u8 uuid[16]; - - /* version number: 5 is current */ - u32 version; - - /* physical addr of tb_log_t log */ - u32 log_addr; - - /* - * physical addr of entry point for tboot shutdown and - * type of shutdown (TB_SHUTDOWN_*) being requested - */ - u32 shutdown_entry; - u32 shutdown_type; - - /* kernel-specified ACPI info for Sx shutdown */ - struct tboot_acpi_sleep_info acpi_sinfo; - - /* tboot location in memory (physical) 
*/ - u32 tboot_base; - u32 tboot_size; - - /* memory regions (phys addrs) for tboot to MAC on S3 */ - u8 num_mac_regions; - struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; - - - /* - * version 4+ fields: - */ - - /* symmetric key for use by kernel; will be encrypted on S3 */ - u8 s3_key[TB_KEY_SIZE]; - - - /* - * version 5+ fields: - */ - - /* used to 4byte-align num_in_wfs */ - u8 reserved_align[3]; - - /* number of processors in wait-for-SIPI */ - u32 num_in_wfs; -} __packed; - -/* - * UUID for tboot data struct to facilitate matching - * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is - * represented as {} in the char array used here - */ -#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ - 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} - -extern struct tboot *tboot; - -static inline int tboot_enabled(void) -{ - return tboot != NULL; -} - -extern void tboot_probe(void); -extern void tboot_create_trampoline(void); -extern void tboot_shutdown(u32 shutdown_type); -extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); -extern int tboot_wait_for_aps(int num_aps); -extern struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl); -extern int tboot_force_iommu(void); - -#else /* CONFIG_INTEL_TXT */ - -static inline int tboot_enabled(void) -{ - return 0; -} - -static inline void tboot_probe(void) -{ -} - -static inline void tboot_create_trampoline(void) -{ -} - -static inline void tboot_shutdown(u32 shutdown_type) -{ -} - -static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, - u32 pm1b_control) -{ -} - -static inline int tboot_wait_for_aps(int num_aps) -{ - return 0; -} - -static inline struct acpi_table_header *tboot_get_dmar_table( - struct acpi_table_header *dmar_tbl) -{ - return dmar_tbl; -} - -static inline int tboot_force_iommu(void) -{ - return 0; -} - -#endif /* !CONFIG_INTEL_TXT */ - -#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/reboot.c 
b/arch/x86/kernel/reboot.c index 9de01c5d979..18ce5c04242 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -24,8 +25,6 @@ # include #endif -#include - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80d6e9e3248..6ce0d6f38f7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include #include +#include #include