From a47473939db20e3961b200eb00acf5fcf084d755 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 10 Oct 2012 14:53:11 +0200 Subject: perf/x86: Make hardware event translations available in sysfs Add support to display hardware events translations available through the sysfs. Add 'events' group attribute under the sysfs x86 PMU record with attribute/file for each hardware event. This patch adds only backbone for PMUs to display config under 'events' directory. The specific PMU support itself will come in next patches, however this is how the sysfs group will look like: # ls /sys/devices/cpu/events/ branch-instructions branch-misses bus-cycles cache-misses cache-references cpu-cycles instructions ref-cycles stalled-cycles-backend stalled-cycles-frontend The file - hw event ID mapping is: file hw event ID --------------------------------------------------------------- cpu-cycles PERF_COUNT_HW_CPU_CYCLES instructions PERF_COUNT_HW_INSTRUCTIONS cache-references PERF_COUNT_HW_CACHE_REFERENCES cache-misses PERF_COUNT_HW_CACHE_MISSES branch-instructions PERF_COUNT_HW_BRANCH_INSTRUCTIONS branch-misses PERF_COUNT_HW_BRANCH_MISSES bus-cycles PERF_COUNT_HW_BUS_CYCLES stalled-cycles-frontend PERF_COUNT_HW_STALLED_CYCLES_FRONTEND stalled-cycles-backend PERF_COUNT_HW_STALLED_CYCLES_BACKEND ref-cycles PERF_COUNT_HW_REF_CPU_CYCLES Each file in the 'events' directory contains the term translation for the symbolic hw event for the currently running cpu model. # cat /sys/devices/cpu/events/stalled-cycles-backend event=0xb1,umask=0x01,inv,cmask=0x01 Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Stephane Eranian Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1349873598-12583-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 60 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.h | 2 ++ 2 files changed, 62 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4a3374e61a9..9fa4c45ecad 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1316,6 +1316,62 @@ static struct attribute_group x86_pmu_format_group = { .attrs = NULL, }; +struct perf_pmu_events_attr { + struct device_attribute attr; + u64 id; +}; + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + + u64 config = x86_pmu.event_map(pmu_attr->id); + return x86_pmu.events_sysfs_show(page, config); +} + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ +}; + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct 
attribute *empty_attrs; + +struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1362,6 +1418,9 @@ static int __init init_hw_perf_events(void) x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1651,6 +1710,7 @@ static struct attribute_group x86_pmu_attr_group = { static const struct attribute_group *x86_pmu_attr_groups[] = { &x86_pmu_attr_group, &x86_pmu_format_group, + &x86_pmu_events_group, NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 271d2570029..6f75b6a7f37 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -354,6 +354,8 @@ struct x86_pmu { int attr_rdpmc; struct attribute **format_attrs; + ssize_t (*events_sysfs_show)(char *page, u64 config); + /* * CPU Hotplug hooks */ -- cgit v1.2.3-70-g09d2 From 8300daa26755c9a194776778bd822acf1fa2dbf6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 10 Oct 2012 14:53:12 +0200 Subject: perf/x86: Filter out undefined events from sysfs events attribute The sysfs events group attribute currently shows all hw events, including also undefined ones. This patch filters out all undefined events out of the sysfs events group attribute, so they don't even show up. Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1349873598-12583-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 9fa4c45ecad..39737a678a8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1321,6 +1321,26 @@ struct perf_pmu_events_attr { u64 id; }; +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + int i, j; + + for (i = 0; attrs[i]; i++) { + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + } +} + ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { @@ -1420,6 +1440,8 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); pr_info("... version: %d\n", x86_pmu.version); pr_info("... 
bit width: %d\n", x86_pmu.cntval_bits); -- cgit v1.2.3-70-g09d2 From 43c032febde48aabcf6d59f47cdcb7b5debbdc63 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 10 Oct 2012 14:53:13 +0200 Subject: perf/x86: Add hardware events translations for Intel cpus Add support for Intel processors to display 'events' sysfs directory (/sys/devices/cpu/events/) with hw event translations: # ls /sys/devices/cpu/events/ branch-instructions branch-misses bus-cycles cache-misses cache-references cpu-cycles instructions ref-cycles stalled-cycles-backend stalled-cycles-frontend Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Stephane Eranian Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1349873598-12583-4-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 40 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/perf_event.h | 2 ++ arch/x86/kernel/cpu/perf_event_intel.c | 2 ++ 3 files changed, 44 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 39737a678a8..8a1fa23452d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1392,6 +1392,46 @@ static struct attribute_group x86_pmu_events_group = { .attrs = events_attr, }; +ssize_t x86_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. 
+ */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6f75b6a7f37..f8aa2f6677f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -538,6 +538,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +ssize_t x86_event_sysfs_show(char *page, u64 config); + #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 324bb523d9d..6106d3b44aa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1628,6 +1628,7 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = x86_event_sysfs_show, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1766,6 +1767,7 @@ static __initconst const struct x86_pmu intel_pmu = { .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = x86_event_sysfs_show, .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, -- cgit v1.2.3-70-g09d2 From 0bf79d44133de42af01a70a1700b8bb4b6d3fb92 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 10 Oct 2012 14:53:14 +0200 Subject: perf/x86: Add hardware events translations for AMD cpus Add support for AMD processors to display 'events' sysfs directory (/sys/devices/cpu/events/) with hw event translations: # ls /sys/devices/cpu/events/ branch-instructions branch-misses bus-cycles cache-misses cache-references cpu-cycles instructions ref-cycles stalled-cycles-backend stalled-cycles-frontend Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Stephane Eranian Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1349873598-12583-5-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +-- arch/x86/kernel/cpu/perf_event.h | 2 +- arch/x86/kernel/cpu/perf_event_amd.c | 9 +++++++++ arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++++++-- 4 files changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8a1fa23452d..0a55ab2ff84 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1392,9 +1392,8 @@ static struct attribute_group x86_pmu_events_group = { .attrs = events_attr, }; -ssize_t x86_event_sysfs_show(char *page, u64 config) +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) { - u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; bool edge = (config & 
ARCH_PERFMON_EVENTSEL_EDGE); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f8aa2f6677f..21419b9178b 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -538,7 +538,7 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } -ssize_t x86_event_sysfs_show(char *page, u64 config); +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); #ifdef CONFIG_CPU_SUP_AMD diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4528ae7b6ec..c93bc4e813a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -568,6 +568,14 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, @@ -591,6 +599,7 @@ static __initconst const struct x86_pmu amd_pmu = { .put_event_constraints = amd_put_event_constraints, .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6106d3b44aa..93b9e1181f8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1603,6 +1603,13 @@ static struct attribute *intel_arch_formats_attr[] = { NULL, }; +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, @@ -1628,7 +1635,7 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, .guest_get_msrs = core_guest_get_msrs, .format_attrs = intel_arch_formats_attr, - .events_sysfs_show = x86_event_sysfs_show, + .events_sysfs_show = intel_event_sysfs_show, }; struct intel_shared_regs *allocate_shared_regs(int cpu) @@ -1767,7 +1774,7 @@ static __initconst const struct x86_pmu intel_pmu = { .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, - .events_sysfs_show = x86_event_sysfs_show, + .events_sysfs_show = intel_event_sysfs_show, .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, -- cgit v1.2.3-70-g09d2 From 20550a434583c78f8ff9a2819639e2bacbe58574 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 10 Oct 2012 14:53:15 +0200 Subject: perf/x86: Add hardware events translations for Intel P6 cpus Add support for Intel P6 processors to display 'events' sysfs directory (/sys/devices/cpu/events/) with hw event translations: # ls /sys/devices/cpu/events/ branch-instructions branch-misses bus-cycles cache-misses cache-references cpu-cycles instructions ref-cycles stalled-cycles-backend stalled-cycles-frontend Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Vince Weaver Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1349873598-12583-6-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- 
arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_p6.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 21419b9178b..115c1ea9774 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -539,6 +539,7 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) } ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); #ifdef CONFIG_CPU_SUP_AMD diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index e4dd0f7a045..900b76b5d6e 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -134,6 +134,8 @@ static __initconst const struct x86_pmu p6_pmu = { .event_constraints = p6_event_constraints, .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + }; __init int p6_pmu_init(void) -- cgit v1.2.3-70-g09d2 From ce37f400336a34bb6e72c4700f9dcc2a41ff7163 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 8 Oct 2012 13:07:30 +0100 Subject: x86: Allow tracing of functions in arch/x86/kernel/rtc.c Move native_read_tsc() to tsc.c to allow profiling to be re-enabled for rtc.c. Signed-off-by: David Vrabel Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1349698050-6560-1-git-send-email-david.vrabel@citrix.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 - arch/x86/kernel/rtc.c | 6 ------ arch/x86/kernel/tsc.c | 6 ++++++ 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 91ce48f05f9..9fd5eed3f8f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -9,7 +9,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg -CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg CFLAGS_REMOVE_kvmclock.o = -pg diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 4929c1be0ac..801602b5d74 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -195,12 +195,6 @@ void read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } -unsigned long long native_read_tsc(void) -{ - return __native_read_tsc(); -} -EXPORT_SYMBOL(native_read_tsc); - static struct resource rtc_resources[] = { [0] = { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index cfa5d4f7ca5..06ccb5073a3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -77,6 +77,12 @@ unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); #endif +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + int check_tsc_unstable(void) { return tsc_unstable; -- cgit v1.2.3-70-g09d2 From 4d0e42cc66f4e7e0bf08b29da1ae6ebd60549c4e Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 25 Oct 2012 18:13:11 +0200 Subject: x86: Remove dead hlt_use_halt code The hlt_use_halt function returns always true and there is only one definition of it. The default_idle function can then get ride of the if ... statement and we can remove the else branch. 
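For reference, a minimal sketch of the un-nested idle sequence this leaves behind (simplified from the default_idle() hunk below; tracing hooks omitted, comments explanatory only):

  current_thread_info()->status &= ~TS_POLLING;
  /*
   * The store clearing TS_POLLING must be visible before the
   * need_resched() test, otherwise a wakeup that relies on the
   * polling flag could be missed and the CPU would halt with
   * work pending.
   */
  smp_mb();
  if (!need_resched())
          safe_halt();            /* enables interrupts racelessly */
  else
          local_irq_enable();
  current_thread_info()->status |= TS_POLLING;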
Signed-off-by: Daniel Lezcano Cc: linaro-dev@lists.linaro.org Cc: patches@linaro.org Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1351181591-8710-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b644e1c765d..2f99e312187 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -306,11 +306,6 @@ void (*pm_idle)(void); EXPORT_SYMBOL(pm_idle); #endif -static inline int hlt_use_halt(void) -{ - return 1; -} - #ifndef CONFIG_SMP static inline void play_dead(void) { @@ -410,28 +405,22 @@ void cpu_idle(void) */ void default_idle(void) { - if (hlt_use_halt()) { - trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); - trace_cpu_idle_rcuidle(1, smp_processor_id()); - current_thread_info()->status &= ~TS_POLLING; - /* - * TS_POLLING-cleared state must be visible before we - * test NEED_RESCHED: - */ - smp_mb(); + trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle_rcuidle(1, smp_processor_id()); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); - if (!need_resched()) - safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); - current_thread_info()->status |= TS_POLLING; - trace_power_end_rcuidle(smp_processor_id()); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - } else { + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else local_irq_enable(); - /* loop is done by the caller */ - cpu_relax(); - } + current_thread_info()->status |= TS_POLLING; + trace_power_end_rcuidle(smp_processor_id()); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); -- cgit v1.2.3-70-g09d2 From 2c5594df344cd1ff0cc9bf007dea3235582b3acf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 26 Oct 2012 11:40:28 +0200 Subject: rcu: Fix unrecovered RCU user mode in syscall_trace_leave() On x86-64 syscall exit, 3 non exclusive events may happen looping in the following order: 1) Check if we need resched for user preemption, if so call schedule_user() 2) Check if we have pending signals, if so call do_notify_resume() 3) Check if we do syscall tracing, if so call syscall_trace_leave() However syscall_trace_leave() has been written assuming it directly follows the syscall and forget about the above possible 1st and 2nd steps. Now schedule_user() and do_notify_resume() exit in RCU user mode because they have most chances to resume userspace immediately and this avoids an rcu_user_enter() call in the syscall fast path. So by the time we call syscall_trace_leave(), we may well be in RCU user mode. To fix this up, simply call rcu_user_exit() in the beginning of this function. This fixes some reported RCU uses in extended quiescent state. Reported-by: Dave Jones Reported-by: Sergey Senozhatsky Signed-off-by: Frederic Weisbecker Tested-by: Sergey Senozhatsky Signed-off-by: Paul E. 
McKenney --- arch/x86/kernel/ptrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a1839..eff5b8c6865 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1511,6 +1511,13 @@ void syscall_trace_leave(struct pt_regs *regs) { bool step; + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + rcu_user_exit(); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) -- cgit v1.2.3-70-g09d2 From e6d41e8c697e07832efa4a85bf23438bc4c4e1b2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 29 Oct 2012 18:40:08 +0100 Subject: x86, AMD: Change Boris' email address Move to private email and put in maintained status. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1351532410-4887-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 698b6ec12e0..1ac581f38df 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -6,7 +6,7 @@ * * Written by Jacob Shin - AMD, Inc. * - * Support: borislav.petkov@amd.com + * Maintained by: Borislav Petkov * * April 2006 * - added support for AMD Family 0x10 processors -- cgit v1.2.3-70-g09d2 From 943482d07e926128eed0482b879736f912c429e4 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Mon, 29 Oct 2012 18:51:38 +0100 Subject: x86, microcode_amd: Change email addresses, MAINTAINERS entry Signed-off-by: Andreas Herrmann Cc: lm-sensors@lm-sensors.org Cc: oprofile-list@lists.sf.net Cc: Stephane Eranian Cc: Robert Richter Cc: Borislav Petkov Cc: Jorg Roedel Cc: Rafael J. Wysocki Cc: Jean Delvare Cc: Guenter Roeck Link: http://lkml.kernel.org/r/20121029175138.GC5024@tweety Signed-off-by: Ingo Molnar --- MAINTAINERS | 4 ++-- arch/x86/kernel/microcode_amd.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/MAINTAINERS b/MAINTAINERS index e775b874920..17403702f56 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -534,9 +534,9 @@ F: drivers/iommu/amd_iommu*.[ch] F: include/linux/amd-iommu.h AMD MICROCODE UPDATE SUPPORT -M: Andreas Herrmann +M: Andreas Herrmann L: amd64-microcode@amd64.org -S: Supported +S: Maintained F: arch/x86/kernel/microcode_amd.c AMS (Apple Motion Sensor) DRIVER diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7720ff5a9ee..b3e67ba55b7 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -8,8 +8,8 @@ * Tigran Aivazian * * Maintainers: - * Andreas Herrmann - * Borislav Petkov + * Andreas Herrmann + * Borislav Petkov * * This driver allows to upgrade microcode on F10h AMD * CPUs and later. -- cgit v1.2.3-70-g09d2 From 95d18aa2b6c05351181934b3bc34ce038cc7b637 Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Mon, 29 Oct 2012 21:48:17 +0100 Subject: perf/x86: Fix sparse warnings FYI, there are new sparse warnings: arch/x86/kernel/cpu/perf_event.c:1356:18: sparse: symbol 'events_attr' was not declared. Should it be static? This patch makes it static and also adds the static keyword to fix arch/x86/kernel/cpu/perf_event.c:1344:9: warning: symbol 'events_sysfs_show' was not declared. 
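The rule sparse enforces here, shown as a minimal sketch with hypothetical names (the patch applies the same fix to events_sysfs_show() and events_attr[]): a symbol that has no declaration visible to other translation units should be static.

  /* declared in a shared header, callable from other files: keep it global */
  int shared_helper(int x);
  int shared_helper(int x)
  {
          return x + 1;
  }

  /*
   * used only in this file and never declared elsewhere: mark it static,
   * otherwise sparse warns "symbol 'local_helper' was not declared.
   * Should it be static?"
   */
  static int local_helper(int x)
  {
          return x * 2;
  }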
Signed-off-by: Peter Huewe Cc: Peter Zijlstra Cc: Yuanhan Liu Cc: fengguang.wu@intel.com Cc: Jiri Olsa Link: http://lkml.kernel.org/n/tip-lerdpXlnruh0yvWs2owwuizl@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0a55ab2ff84..4428fd178bc 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1341,7 +1341,7 @@ static void __init filter_events(struct attribute **attrs) } } -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, +static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = \ @@ -1373,7 +1373,7 @@ EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); static struct attribute *empty_attrs; -struct attribute *events_attr[] = { +static struct attribute *events_attr[] = { EVENT_PTR(CPU_CYCLES), EVENT_PTR(INSTRUCTIONS), EVENT_PTR(CACHE_REFERENCES), -- cgit v1.2.3-70-g09d2 From 85b97637bb40a9f486459dd254598759af9c3d50 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Mon, 29 Oct 2012 11:01:50 +0800 Subject: x86/mce: Do not change worker's running cpu in cmci_rediscover(). cmci_rediscover() used set_cpus_allowed_ptr() to change the current process's running cpu, and migrate itself to the dest cpu. But worker processes are not allowed to be migrated. If current is a worker, the worker will be migrated to another cpu, but the corresponding worker_pool is still on the original cpu. In this case, the following BUG_ON in try_to_wake_up_local() will be triggered: BUG_ON(rq != this_rq()); This will cause the kernel panic. The call trace is like the following: [ 6155.451107] ------------[ cut here ]------------ [ 6155.452019] kernel BUG at kernel/sched/core.c:1654! ...... [ 6155.452019] RIP: 0010:[] [] try_to_wake_up_local+0x115/0x130 ...... [ 6155.452019] Call Trace: [ 6155.452019] [] __schedule+0x764/0x880 [ 6155.452019] [] schedule+0x29/0x70 [ 6155.452019] [] schedule_timeout+0x235/0x2d0 [ 6155.452019] [] ? mark_held_locks+0x8d/0x140 [ 6155.452019] [] ? __lock_release+0x133/0x1a0 [ 6155.452019] [] ? _raw_spin_unlock_irq+0x30/0x50 [ 6155.452019] [] ? trace_hardirqs_on_caller+0x105/0x190 [ 6155.452019] [] wait_for_common+0x12b/0x180 [ 6155.452019] [] ? try_to_wake_up+0x2f0/0x2f0 [ 6155.452019] [] wait_for_completion+0x1d/0x20 [ 6155.452019] [] stop_one_cpu+0x8a/0xc0 [ 6155.452019] [] ? __migrate_task+0x1a0/0x1a0 [ 6155.452019] [] ? complete+0x28/0x60 [ 6155.452019] [] set_cpus_allowed_ptr+0x128/0x130 [ 6155.452019] [] cmci_rediscover+0xf5/0x140 [ 6155.452019] [] mce_cpu_callback+0x18d/0x19d [ 6155.452019] [] notifier_call_chain+0x67/0x150 [ 6155.452019] [] __raw_notifier_call_chain+0xe/0x10 [ 6155.452019] [] __cpu_notify+0x20/0x40 [ 6155.452019] [] cpu_notify_nofail+0x15/0x30 [ 6155.452019] [] _cpu_down+0x262/0x2e0 [ 6155.452019] [] cpu_down+0x36/0x50 [ 6155.452019] [] acpi_processor_remove+0x50/0x11e [ 6155.452019] [] acpi_device_remove+0x90/0xb2 [ 6155.452019] [] __device_release_driver+0x7c/0xf0 [ 6155.452019] [] device_release_driver+0x2f/0x50 [ 6155.452019] [] acpi_bus_remove+0x32/0x6d [ 6155.452019] [] acpi_bus_trim+0x87/0xee [ 6155.452019] [] acpi_bus_hot_remove_device+0x88/0x16b [ 6155.452019] [] acpi_os_execute_deferred+0x27/0x34 [ 6155.452019] [] process_one_work+0x219/0x680 [ 6155.452019] [] ? process_one_work+0x1b8/0x680 [ 6155.452019] [] ? 
acpi_os_wait_events_complete+0x23/0x23 [ 6155.452019] [] worker_thread+0x12e/0x320 [ 6155.452019] [] ? manage_workers+0x110/0x110 [ 6155.452019] [] kthread+0xc6/0xd0 [ 6155.452019] [] kernel_thread_helper+0x4/0x10 [ 6155.452019] [] ? retint_restore_args+0x13/0x13 [ 6155.452019] [] ? __init_kthread_worker+0x70/0x70 [ 6155.452019] [] ? gs_change+0x13/0x13 This patch removes the set_cpus_allowed_ptr() call, and put the cmci rediscover jobs onto all the other cpus using system_wq. This could bring some delay for the jobs. Signed-off-by: Tang Chen Signed-off-by: Miao Xie Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 5f88abf07e9..4f9a3cbfc4a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -285,34 +285,39 @@ void cmci_clear(void) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } +static long cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); + + return 0; +} + /* * After a CPU went down cycle through all the others and rediscover * Must run in process context. */ void cmci_rediscover(int dying) { - int banks; - int cpu; - cpumask_var_t old; + int cpu, banks; if (!cmci_supported(&banks)) return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); for_each_online_cpu(cpu) { if (cpu == dying) continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) + + if (cpu == smp_processor_id()) { + cmci_rediscover_work_func(NULL); continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks); - } + } - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); + work_on_cpu(cpu, cmci_rediscover_work_func, NULL); + } } /* -- cgit v1.2.3-70-g09d2 From 2bbf0a1427c377350f001fbc6260995334739ad7 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 31 Oct 2012 17:20:50 +0100 Subject: x86, amd: Disable way access filter on Piledriver CPUs The Way Access Filter in recent AMD CPUs may hurt the performance of some workloads, caused by aliasing issues in the L1 cache. This patch disables it on the affected CPUs. The issue is similar to that one of last year: http://lkml.indiana.edu/hypermail/linux/kernel/1107.3/00041.html This new patch does not replace the old one, we just need another quirk for newer CPUs. The performance penalty without the patch depends on the circumstances, but is a bit less than the last year's 3%. The workloads affected would be those that access code from the same physical page under different virtual addresses, so different processes using the same libraries with ASLR or multiple instances of PIE-binaries. The code needs to be accessed simultaneously from both cores of the same compute unit. More details can be found here: http://developer.amd.com/Assets/SharedL1InstructionCacheonAMD15hCPU.pdf CPUs affected are anything with the core known as Piledriver. That includes the new parts of the AMD A-Series (aka Trinity) and the just released new CPUs of the FX-Series (aka Vishera). The model numbering is a bit odd here: FX CPUs have model 2, A-Series has model 10h, with possible extensions to 1Fh. Hence the range of model ids. 
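A hedged userspace sketch (not part of the patch) for checking that the workaround took effect on an affected CPU; it assumes root privileges and the msr driver (/dev/cpu/0/msr), and simply reads MSR 0xc0011021 to see whether the 0x1E bits are set:

  #define _FILE_OFFSET_BITS 64
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          uint64_t val;
          int fd = open("/dev/cpu/0/msr", O_RDONLY);

          /* the msr driver maps the file offset to the MSR number */
          if (fd < 0 || pread(fd, &val, sizeof(val), 0xc0011021) != sizeof(val)) {
                  perror("msr read");
                  return 1;
          }
          printf("MSR 0xc0011021 = %#llx -> way access filter %s\n",
                 (unsigned long long)val,
                 (val & 0x1E) == 0x1E ? "disabled (quirk active)" : "enabled");
          close(fd);
          return 0;
  }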
Signed-off-by: Andre Przywara Link: http://lkml.kernel.org/r/1351700450-9277-1-git-send-email-osp@andrep.de Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d1..1b7d1656a04 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -631,6 +631,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + u64 val; + + if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) { + val |= 0x1E; + wrmsrl_safe(0xc0011021, val); + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ -- cgit v1.2.3-70-g09d2 From 279f1461432ccdec0b98c0bcbe0a8e2c0f6fdda5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 22 Oct 2012 14:37:58 -0700 Subject: x86: apic: Use tsc deadline for oneshot when available If the TSC deadline mode is supported, LAPIC timer one-shot mode can be implemented using IA32_TSC_DEADLINE MSR. An interrupt will be generated when the TSC value equals or exceeds the value in the IA32_TSC_DEADLINE MSR. This enables us to skip the APIC calibration during boot. Also, in xapic mode, this enables us to skip the uncached apic access to re-arm the APIC timer. As this timer ticks at the high frequency TSC rate, we use the TSC_DIVISOR (32) to work with the 32-bit restrictions in the clockevent API's to avoid 64-bit divides etc (frequency is u32 and "unsigned long" in the set_next_event(), max_delta limits the next event to 32-bit for 32-bit kernel). Signed-off-by: Suresh Siddha Cc: venki@google.com Cc: len.brown@intel.com Link: http://lkml.kernel.org/r/1350941878.6017.31.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Thomas Gleixner --- Documentation/kernel-parameters.txt | 4 ++ arch/x86/include/asm/msr-index.h | 2 + arch/x86/kernel/apic/apic.c | 73 +++++++++++++++++++++++++++---------- 3 files changed, 59 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9776f068306..4aa9ca0de63 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1304,6 +1304,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. lapic [X86-32,APIC] Enable the local APIC even if BIOS disabled it. + lapic= [x86,APIC] "notscdeadline" Do not use TSC deadline + value for LAPIC timer one-shot implementation. Default + back to the programmable timer unit in the LAPIC. + lapic_timer_c2_ok [X86,APIC] trust the local apic timer in C2 power state. 
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 7f0edceb756..e400cdb2dd6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -337,6 +337,8 @@ #define MSR_IA32_MISC_ENABLE_TURBO_DISABLE (1ULL << 38) #define MSR_IA32_MISC_ENABLE_IP_PREF_DISABLE (1ULL << 39) +#define MSR_IA32_TSC_DEADLINE 0x000006E0 + /* P4/Xeon+ specific */ #define MSR_IA32_MCG_EAX 0x00000180 #define MSR_IA32_MCG_EBX 0x00000181 diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b17416e72fb..b994cc84aa7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -90,21 +90,6 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic __initdata; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; @@ -133,6 +118,25 @@ static inline void imcr_apic_to_pic(void) } #endif +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic __initdata; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + if (config_enabled(CONFIG_X86_32) && !arg) + force_enable_local_apic = 1; + else if (!strncmp(arg, "notscdeadline", 13)) + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return 0; +} +early_param("lapic", parse_lapic); + #ifdef CONFIG_X86_64 static int apic_calibrate_pmtmr __initdata; static __init int setup_apicpmtimer(char *s) @@ -315,6 +319,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 +#define TSC_DIVISOR 32 /* * This function sets up the local APIC timer, with a timeout of @@ -333,6 +338,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) lvtt_value = LOCAL_TIMER_VECTOR; if (!oneshot) lvtt_value |= APIC_LVT_TIMER_PERIODIC; + else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + if (!lapic_is_integrated()) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -341,6 +349,11 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) apic_write(APIC_LVTT, lvtt_value); + if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); + return; + } + /* * Divide PICLK by 16 */ @@ -453,6 +466,16 @@ static int lapic_next_event(unsigned long delta, return 0; } +static int lapic_next_deadline(unsigned long delta, + struct clock_event_device *evt) +{ + u64 tsc; + + rdtscll(tsc); + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + return 0; +} + /* * Setup the lapic timer in periodic or oneshot mode */ @@ -533,7 +556,15 @@ static void __cpuinit setup_APIC_timer(void) memcpy(levt, &lapic_clockevent, sizeof(*levt)); levt->cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(levt); + if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_DUMMY); + levt->set_next_event = lapic_next_deadline; + clockevents_config_and_register(levt, + (tsc_khz / TSC_DIVISOR) * 1000, + 0xF, ~0UL); + } else + clockevents_register_device(levt); } /* @@ -661,7 
+692,9 @@ static int __init calibrate_APIC_clock(void) * in the clockevent structure and return. */ - if (lapic_timer_frequency) { + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + return 0; + } else if (lapic_timer_frequency) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", lapic_timer_frequency); lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, @@ -674,6 +707,9 @@ static int __init calibrate_APIC_clock(void) return 0; } + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + local_irq_disable(); /* Replace the global interrupt handler */ @@ -811,9 +847,6 @@ void __init setup_boot_APIC_clock(void) return; } - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) -- cgit v1.2.3-70-g09d2 From 28696f434fef0efa97534b59986ad33b9c4df7f8 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Mon, 1 Oct 2012 17:29:25 -0700 Subject: x86: Don't clobber top of pt_regs in nested NMI The nested NMI modifies the place (instruction, flags and stack) that the first NMI will iret to. However, the copy of registers modified is exactly the one that is the part of pt_regs in the first NMI. This can change the behaviour of the first NMI. In particular, Google's arch_trigger_all_cpu_backtrace handler also prints regions of memory surrounding addresses appearing in registers. This results in handled exceptions, after which nested NMIs start coming in. These nested NMIs change the value of registers in pt_regs. This can cause the original NMI handler to produce incorrect output. We solve this problem by interchanging the position of the preserved copy of the iret registers ("saved") and the copy subject to being trampled by nested NMI ("copied"). Link: http://lkml.kernel.org/r/20121002002919.27236.14388.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Salman Qazi [ Added a needed CFI_ADJUST_CFA_OFFSET ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7ee51..811795db4fc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1699,9 +1699,10 @@ nested_nmi: 1: /* Set up the interrupted NMIs stack to jump to repeat_nmi */ - leaq -6*8(%rsp), %rdx + leaq -1*8(%rsp), %rdx movq %rdx, %rsp - CFI_ADJUST_CFA_OFFSET 6*8 + CFI_ADJUST_CFA_OFFSET 1*8 + leaq -10*8(%rsp), %rdx pushq_cfi $__KERNEL_DS pushq_cfi %rdx pushfq_cfi @@ -1709,8 +1710,8 @@ nested_nmi: pushq_cfi $repeat_nmi /* Put stack back */ - addq $(11*8), %rsp - CFI_ADJUST_CFA_OFFSET -11*8 + addq $(6*8), %rsp + CFI_ADJUST_CFA_OFFSET -6*8 nested_nmi_out: popq_cfi %rdx @@ -1736,18 +1737,18 @@ first_nmi: * +-------------------------+ * | NMI executing variable | * +-------------------------+ - * | Saved SS | - * | Saved Return RSP | - * | Saved RFLAGS | - * | Saved CS | - * | Saved RIP | - * +-------------------------+ * | copied SS | * | copied Return RSP | * | copied RFLAGS | * | copied CS | * | copied RIP | * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ * | pt_regs | * +-------------------------+ * @@ -1763,9 +1764,14 @@ first_nmi: /* Set the NMI executing variable on the stack. 
*/ pushq_cfi $1 + /* + * Leave room for the "copied" frame + */ + subq $(5*8), %rsp + /* Copy the stack frame to the Saved frame */ .rept 5 - pushq_cfi 6*8(%rsp) + pushq_cfi 11*8(%rsp) .endr CFI_DEF_CFA_OFFSET SS+8-RIP @@ -1786,12 +1792,15 @@ repeat_nmi: * is benign for the non-repeat case, where 1 was pushed just above * to this very stack slot). */ - movq $1, 5*8(%rsp) + movq $1, 10*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ + addq $(10*8), %rsp + CFI_ADJUST_CFA_OFFSET -10*8 .rept 5 - pushq_cfi 4*8(%rsp) + pushq_cfi -6*8(%rsp) .endr + subq $(5*8), %rsp CFI_DEF_CFA_OFFSET SS+8-RIP end_repeat_nmi: @@ -1842,8 +1851,12 @@ nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_ALL 8 + + /* Pop the extra iret frame */ + addq $(5*8), %rsp + /* Clear the NMI executing stack variable */ - movq $0, 10*8(%rsp) + movq $0, 5*8(%rsp) jmp irq_return CFI_ENDPROC END(nmi) -- cgit v1.2.3-70-g09d2 From 6acf5a8c931da9d26c8dd77d784daaf07fa2bff0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 2 Nov 2012 14:02:40 +0000 Subject: x86: hpet: Fix masking of MSI interrupts HPET_TN_FSB is not a proper mask bit; it merely toggles between MSI and legacy interrupt delivery. The proper mask bit is HPET_TN_ENABLE, so use both bits when (un)masking the interrupt. Signed-off-by: Jan Beulich Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/5093E09002000078000A60E6@nat28.tlf.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1460a5df92f..e28670f9a58 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -434,7 +434,7 @@ void hpet_msi_unmask(struct irq_data *data) /* unmask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); - cfg |= HPET_TN_FSB; + cfg |= HPET_TN_ENABLE | HPET_TN_FSB; hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } @@ -445,7 +445,7 @@ void hpet_msi_mask(struct irq_data *data) /* mask it */ cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); - cfg &= ~HPET_TN_FSB; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); } -- cgit v1.2.3-70-g09d2 From 5074b85bdd3a464efe7b6de2ec163f4c07696a20 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 2 Nov 2012 14:00:29 +0000 Subject: x86: hpet: Fix inverted return value check in arch_setup_hpet_msi() setup_hpet_msi_remapped() returns a negative error indicator on error - check for this rather than for a boolean false indication, and pass on that error code rather than a meaningless "-1". 
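The convention being restored, as a tiny sketch with hypothetical names: kernel-style setup functions return 0 on success and a negative errno on failure, so the caller tests for non-zero and passes the error through unchanged.

  #include <errno.h>

  static int setup_backend(void)
  {
          return -ENODEV;                 /* example failure */
  }

  static int setup_frontend(void)
  {
          int ret = setup_backend();

          if (ret)                        /* not "if (!ret)": zero means success */
                  return ret;             /* propagate the real error, not -1 */

          /* ... continue with the rest of the setup ... */
          return 0;
  }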
Signed-off-by: Jan Beulich Cc: David Woodhouse Link: http://lkml.kernel.org/r/5093E00D02000078000A60E2@nat28.tlf.novell.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1817fa91102..b134f0b7ed2 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3317,8 +3317,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) int ret; if (irq_remapping_enabled) { - if (!setup_hpet_msi_remapped(irq, id)) - return -1; + ret = setup_hpet_msi_remapped(irq, id); + if (ret) + return ret; } ret = msi_compose_msg(NULL, irq, &msg, id); -- cgit v1.2.3-70-g09d2 From 4dc316c64594d1a5ef2d61fba5ae0fe7fe18cdca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 28 Oct 2012 17:57:30 +0100 Subject: uprobes/x86: Cleanup the single-stepping code No functional changes. Now that default arch_uprobe_enable/disable_step() helpers do nothing, x86 has no reason to reimplement them. Change arch_uprobe_*_xol() hooks to do the necessary work and remove the x86-specific hooks. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 54 +++++++++++++++++------------------------------ 1 file changed, 19 insertions(+), 35 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aafa5557b39..c71025b6746 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -478,6 +478,11 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->ip = current->utask->xol_vaddr; pre_xol_rip_insn(auprobe, regs, autask); + autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + return 0; } @@ -603,6 +608,16 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (auprobe->fixups & UPROBE_FIX_CALL) result = adjust_ret_addr(regs->sp, correction); + /* + * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP + * so we can get an extra SIGTRAP if we do not clear TF. We need + * to examine the opcode to make it right. 
+ */ + if (utask->autask.saved_tf) + send_sig(SIGTRAP, current, 0); + else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + regs->flags &= ~X86_EFLAGS_TF; + return result; } @@ -647,6 +662,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) current->thread.trap_nr = utask->autask.saved_trap_nr; handle_riprel_post_xol(auprobe, regs, NULL); instruction_pointer_set(regs, utask->vaddr); + + /* clear TF if it was set by us in arch_uprobe_pre_xol() */ + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; } /* @@ -676,38 +695,3 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) send_sig(SIGTRAP, current, 0); return ret; } - -void arch_uprobe_enable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - struct pt_regs *regs = task_pt_regs(task); - - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); - - regs->flags |= X86_EFLAGS_TF; - if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) - set_task_blockstep(task, false); -} - -void arch_uprobe_disable_step(struct arch_uprobe *auprobe) -{ - struct task_struct *task = current; - struct arch_uprobe_task *autask = &task->utask->autask; - bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); - struct pt_regs *regs = task_pt_regs(task); - /* - * The state of TIF_BLOCKSTEP was not saved so we can get an extra - * SIGTRAP if we do not clear TF. We need to examine the opcode to - * make it right. - */ - if (unlikely(trapped)) { - if (!autask->saved_tf) - regs->flags &= ~X86_EFLAGS_TF; - } else { - if (autask->saved_tf) - send_sig(SIGTRAP, task, 0); - else if (!(auprobe->fixups & UPROBE_FIX_SETF)) - regs->flags &= ~X86_EFLAGS_TF; - } -} -- cgit v1.2.3-70-g09d2 From 193f3fcb3ab769bab4a2b9fa181eef3e5699a352 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Oct 2012 10:58:13 +0200 Subject: x86: Add cpu_has_topoext Introduce cpu_has_topoext to check for AMD's CPUID topology extensions support. It indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX See AMD's CPUID Specification, Publication # 25481 (as of Rev. 2.34 September 2010) Signed-off-by: Andreas Herrmann Link: http://lkml.kernel.org/r/20121019085813.GD26718@alberich Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8c297aa53ee..c22a492daf5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -311,6 +311,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) +#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f7e98a2c0d1..64e9ad4e49a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -304,7 +304,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); /* get information required for multi-node processors */ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (cpu_has_topoext) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528..732bf5cff64 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -313,7 +313,7 @@ do { \ static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (cpu_has_topoext) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && -- cgit v1.2.3-70-g09d2 From 04a1541828ea223169eb44a336bfad8ec0dfb46a Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Oct 2012 10:59:33 +0200 Subject: x86, cacheinfo: Determine number of cache leafs using CPUID 0x8000001d on AMD CPUID 0x8000001d works quite similar to Intels' CPUID function 4. Use it to determine number of cache leafs. Signed-off-by: Andreas Herrmann Link: http://lkml.kernel.org/r/20121019085933.GE26718@alberich Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/amd.c | 7 +------ arch/x86/kernel/cpu/intel_cacheinfo.c | 28 +++++++++++++++++++++++----- 3 files changed, 25 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ad1fc851167..db0d8c32090 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -187,7 +187,7 @@ extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; +extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); extern void detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 64e9ad4e49a..a8538e6d2ff 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -643,12 +643,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) detect_ht(c); #endif - if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } + init_amd_cacheinfo(c); if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 93c5451bdd5..8ce7a83252f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -557,21 +557,39 @@ __cpuinit cpuid4_cache_lookup_regs(int index, return 0; } -static int __cpuinit find_num_cache_leaves(void) +static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx; + unsigned int eax, ebx, ecx, edx, op; union _cpuid4_leaf_eax cache_eax; int i = -1; + if (c->x86_vendor == X86_VENDOR_AMD) + op = 0x8000001d; + else + op = 4; + do { ++i; - /* Do cpuid(4) loop to find out num_cache_leaves */ - cpuid_count(4, i, &eax, &ebx, &ecx, &edx); + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CACHE_TYPE_NULL); return i; } +void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (cpu_has_topoext) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ @@ -588,7 +606,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (is_initialized == 0) { /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(); + num_cache_leaves = find_num_cache_leaves(c); is_initialized++; } -- cgit v1.2.3-70-g09d2 From 2e8458dfe4202df75543402c7343b8f94de4101e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Oct 2012 11:00:49 +0200 Subject: x86, cacheinfo: Make use of CPUID 0x8000001d for cache information on AMD Rely on CPUID 0x8000001d for cache information when AMD CPUID topology extensions are available. Signed-off-by: Andreas Herrmann Link: http://lkml.kernel.org/r/20121019090049.GF26718@alberich Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 8ce7a83252f..cd2e1ccce59 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -538,7 +538,11 @@ __cpuinit cpuid4_cache_lookup_regs(int index, unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - amd_cpuid4(index, &eax, &ebx, &ecx); + if (cpu_has_topoext) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); -- cgit v1.2.3-70-g09d2 From 27d3a8a26ada7660116fdd6830096008c063ee96 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 19 Oct 2012 11:02:09 +0200 Subject: x86, cacheinfo: Base cache sharing info on CPUID 0x8000001d on AMD The patch is based on a patch submitted by Hans Rosenfeld. See http://marc.info/?l=linux-kernel&m=133908777200931 Note that CPUID Fn8000_001D_EAX slightly differs to Intel's CPUID function 4. Bits 14-25 contain NumSharingCache. Actual number of cores sharing this cache. SW to add value of one to get result. The corresponding bits on Intel are defined as "maximum number of threads sharing this cache" (with a "plus 1" encoding). Thus a different method to determine which cores are sharing a cache level has to be used. Signed-off-by: Andreas Herrmann Link: http://lkml.kernel.org/r/20121019090209.GG26718@alberich Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 41 +++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index cd2e1ccce59..fe9edec6698 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -750,37 +750,50 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf; - int ret, i, sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); + int i, sibling; - ret = 0; - if (index == 3) { - ret = 1; - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + if (cpu_has_topoext) { + unsigned int apicid, nshared, first, last; + + if (!per_cpu(ici_cpuid4_info, cpu)) + return 0; + + this_leaf = CPUID4_INFO_IDX(cpu, index); + nshared = this_leaf->base.eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) continue; set_bit(sibling, this_leaf->shared_cpu_map); } } - } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { - ret = 1; - for_each_cpu(i, cpu_sibling_mask(cpu)) { + } else if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + for_each_cpu(sibling, 
cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); } } - } + } else + return 0; - return ret; + return 1; } static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) -- cgit v1.2.3-70-g09d2 From 8cbd9cc6254065c97c4bac42daa55ba1abe73a8e Mon Sep 17 00:00:00 2001 From: David Sharp Date: Tue, 13 Nov 2012 12:18:21 -0800 Subject: tracing,x86: Add a TSC trace_clock In order to promote interoperability between userspace tracers and ftrace, add a trace_clock that reports raw TSC values which will then be recorded in the ring buffer. Userspace tracers that also record TSCs are then on exactly the same time base as the kernel and events can be unambiguously interlaced. Tested: Enabled a tracepoint and the "tsc" trace_clock and saw very large timestamp values. v2: Move arch-specific bits out of generic code. v3: Rename "x86-tsc", cleanups v7: Generic arch bits in Kbuild. Google-Bug-Id: 6980623 Link: http://lkml.kernel.org/r/1352837903-32191-1-git-send-email-dhsharp@google.com Acked-by: Ingo Molnar Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: David Sharp Signed-off-by: Steven Rostedt --- arch/alpha/include/asm/Kbuild | 1 + arch/arm/include/asm/Kbuild | 1 + arch/arm64/include/asm/Kbuild | 1 + arch/avr32/include/asm/Kbuild | 1 + arch/blackfin/include/asm/Kbuild | 1 + arch/c6x/include/asm/Kbuild | 1 + arch/cris/include/asm/Kbuild | 1 + arch/frv/include/asm/Kbuild | 1 + arch/h8300/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/ia64/include/asm/Kbuild | 1 + arch/m32r/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/mn10300/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + arch/parisc/include/asm/Kbuild | 1 + arch/powerpc/include/asm/Kbuild | 1 + arch/s390/include/asm/Kbuild | 1 + arch/score/include/asm/Kbuild | 1 + arch/sh/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/tile/include/asm/Kbuild | 1 + arch/um/include/asm/Kbuild | 1 + arch/unicore32/include/asm/Kbuild | 1 + arch/x86/include/asm/trace_clock.h | 20 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/trace_clock.c | 21 +++++++++++++++++++++ arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/trace_clock.h | 16 ++++++++++++++++ include/linux/trace_clock.h | 2 ++ kernel/trace/trace.c | 1 + 33 files changed, 88 insertions(+) create mode 100644 arch/x86/include/asm/trace_clock.h create mode 100644 arch/x86/kernel/trace_clock.c create mode 100644 include/asm-generic/trace_clock.h (limited to 'arch/x86/kernel') diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 64ffc9e9e54..dcfabb9f05a 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -11,3 +11,4 @@ header-y += reg.h header-y += regdef.h header-y += sysinfo.h generic-y += exec.h +generic-y += trace_clock.h diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index f70ae175a3d..514e398f1a0 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -31,5 +31,6 @@ generic-y += sockios.h generic-y += termbits.h generic-y += termios.h generic-y += timex.h +generic-y += trace_clock.h generic-y += types.h generic-y += unaligned.h diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index a581a220593..6e9ca462127 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -43,6 +43,7 
@@ generic-y += swab.h generic-y += termbits.h generic-y += termios.h generic-y += topology.h +generic-y += trace_clock.h generic-y += types.h generic-y += unaligned.h generic-y += user.h diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index 4807ded352c..4dd4f78d3dc 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild @@ -1,3 +1,4 @@ generic-y += clkdev.h generic-y += exec.h +generic-y += trace_clock.h diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 5a0625aad6a..27d70759474 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -38,6 +38,7 @@ generic-y += statfs.h generic-y += termbits.h generic-y += termios.h generic-y += topology.h +generic-y += trace_clock.h generic-y += types.h generic-y += ucontext.h generic-y += unaligned.h diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 112a496d835..eae7b5963e8 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -49,6 +49,7 @@ generic-y += termbits.h generic-y += termios.h generic-y += tlbflush.h generic-y += topology.h +generic-y += trace_clock.h generic-y += types.h generic-y += ucontext.h generic-y += user.h diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 6d43a951b5e..15a122c3767 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -11,3 +11,4 @@ header-y += sync_serial.h generic-y += clkdev.h generic-y += exec.h generic-y += module.h +generic-y += trace_clock.h diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index 4a159da2363..c5d76702830 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ -1,3 +1,4 @@ generic-y += clkdev.h generic-y += exec.h +generic-y += trace_clock.h diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 50bbf387b2f..4bc8ae73e08 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -3,3 +3,4 @@ include include/asm-generic/Kbuild.asm generic-y += clkdev.h generic-y += exec.h generic-y += module.h +generic-y += trace_clock.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 3bfa9b30f44..bdb54ceb53b 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -48,6 +48,7 @@ generic-y += stat.h generic-y += termbits.h generic-y += termios.h generic-y += topology.h +generic-y += trace_clock.h generic-y += types.h generic-y += ucontext.h generic-y += unaligned.h diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index dd02f09b6ed..05b03ecd793 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -2,3 +2,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += kvm_para.h +generic-y += trace_clock.h diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index 50bbf387b2f..4bc8ae73e08 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -3,3 +3,4 @@ include include/asm-generic/Kbuild.asm generic-y += clkdev.h generic-y += exec.h generic-y += module.h +generic-y += trace_clock.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 88fa3ac86fa..7f1949c0e08 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -24,6 +24,7 @@ generic-y += sections.h generic-y += siginfo.h generic-y += statfs.h generic-y += topology.h +generic-y += trace_clock.h generic-y += types.h generic-y += word-at-a-time.h 
generic-y += xor.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 8653072d7e9..2957fcc7176 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -3,3 +3,4 @@ include include/asm-generic/Kbuild.asm header-y += elf.h generic-y += clkdev.h generic-y += exec.h +generic-y += trace_clock.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 533053d12ce..9b54b7a403d 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -1 +1,2 @@ # MIPS headers +generic-y += trace_clock.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index 4a159da2363..c5d76702830 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -1,3 +1,4 @@ generic-y += clkdev.h generic-y += exec.h +generic-y += trace_clock.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 78de6805268..8971026e1c6 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -60,6 +60,7 @@ generic-y += swab.h generic-y += termbits.h generic-y += termios.h generic-y += topology.h +generic-y += trace_clock.h generic-y += types.h generic-y += ucontext.h generic-y += user.h diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index bac8debecff..ff4c9faed54 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \ segment.h topology.h vga.h device.h percpu.h hw_irq.h mutex.h \ div64.h irq_regs.h kdebug.h kvm_para.h local64.h local.h param.h \ poll.h xor.h clkdev.h exec.h +generic-y += trace_clock.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index a4fe15e33c6..2d62b484b3f 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -2,3 +2,4 @@ generic-y += clkdev.h generic-y += rwsem.h +generic-y += trace_clock.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 0633dc6d254..f313f9cbcf4 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -1,3 +1,4 @@ generic-y += clkdev.h +generic-y += trace_clock.h diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index ec697aeefd0..16e41fe1a41 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -3,3 +3,4 @@ include include/asm-generic/Kbuild.asm header-y += generic-y += clkdev.h +generic-y += trace_clock.h diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 29f83beeef7..280bea9e5e2 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -31,5 +31,6 @@ generic-y += socket.h generic-y += statfs.h generic-y += termbits.h generic-y += termios.h +generic-y += trace_clock.h generic-y += ucontext.h generic-y += xor.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 645a58da0e8..e26d430ce2f 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -8,4 +8,5 @@ generic-y += local64.h generic-y += irq_regs.h generic-y += local.h generic-y += module.h +generic-y += trace_clock.h generic-y += word-at-a-time.h diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index 6948015e08a..b17b9b8e53c 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -34,5 +34,6 @@ generic-y += sockios.h generic-y += statfs.h generic-y += termbits.h 
generic-y += termios.h +generic-y += trace_clock.h generic-y += types.h generic-y += xor.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 0f6e7b32826..b30f34a7988 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -2,3 +2,4 @@ generic-y += bug.h cputime.h device.h emergency-restart.h futex.h hardirq.h generic-y += hw_irq.h irq_regs.h kdebug.h percpu.h sections.h topology.h xor.h generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h generic-y += switch_to.h clkdev.h +generic-y += trace_clock.h diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index c910c9857e1..7be503e4569 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -54,6 +54,7 @@ generic-y += syscalls.h generic-y += termbits.h generic-y += termios.h generic-y += topology.h +generic-y += trace_clock.h generic-y += types.h generic-y += ucontext.h generic-y += unaligned.h diff --git a/arch/x86/include/asm/trace_clock.h b/arch/x86/include/asm/trace_clock.h new file mode 100644 index 00000000000..5c1652728b6 --- /dev/null +++ b/arch/x86/include/asm/trace_clock.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +#include +#include + +#ifdef CONFIG_X86_TSC + +extern u64 notrace trace_clock_x86_tsc(void); + +# define ARCH_TRACE_CLOCKS \ + { trace_clock_x86_tsc, "x86-tsc" }, + +#else /* !CONFIG_X86_TSC */ + +#define ARCH_TRACE_CLOCKS + +#endif + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9fd5eed3f8f..34e923a5376 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c new file mode 100644 index 00000000000..25b993729f9 --- /dev/null +++ b/arch/x86/kernel/trace_clock.c @@ -0,0 +1,21 @@ +/* + * X86 trace clocks + */ +#include +#include +#include + +/* + * trace_clock_x86_tsc(): A clock that is just the cycle counter. + * + * Unlike the other clocks, this is not in nanoseconds. + */ +u64 notrace trace_clock_x86_tsc(void) +{ + u64 ret; + + rdtsc_barrier(); + rdtscll(ret); + + return ret; +} diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 6d130278999..095f0a2244f 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -25,4 +25,5 @@ generic-y += siginfo.h generic-y += statfs.h generic-y += termios.h generic-y += topology.h +generic-y += trace_clock.h generic-y += xor.h diff --git a/include/asm-generic/trace_clock.h b/include/asm-generic/trace_clock.h new file mode 100644 index 00000000000..6726f1bafb5 --- /dev/null +++ b/include/asm-generic/trace_clock.h @@ -0,0 +1,16 @@ +#ifndef _ASM_GENERIC_TRACE_CLOCK_H +#define _ASM_GENERIC_TRACE_CLOCK_H +/* + * Arch-specific trace clocks. + */ + +/* + * Additional trace clocks added to the trace_clocks + * array in kernel/trace/trace.c + * None if the architecture has not defined it. 
+ */ +#ifndef ARCH_TRACE_CLOCKS +# define ARCH_TRACE_CLOCKS +#endif + +#endif /* _ASM_GENERIC_TRACE_CLOCK_H */ diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h index 4eb490237d4..d563f37e1a1 100644 --- a/include/linux/trace_clock.h +++ b/include/linux/trace_clock.h @@ -12,6 +12,8 @@ #include #include +#include + extern u64 notrace trace_clock_local(void); extern u64 notrace trace_clock(void); extern u64 notrace trace_clock_global(void); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c1434b5ce4d..0d20620c0d2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -488,6 +488,7 @@ static struct { { trace_clock_local, "local" }, { trace_clock_global, "global" }, { trace_clock_counter, "counter" }, + ARCH_TRACE_CLOCKS }; int trace_clock_id; -- cgit v1.2.3-70-g09d2 From 4d25031a81d3cd32edc00de6596db76cc4010685 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:40 -0800 Subject: x86, topology: Don't offline CPU0 if any PIC irq can not be migrated out of it If CONFIG_BOOTPARAM_HOTPLUG_CPU is turned on, CPU0 hotplug feature is enabled by default. If CONFIG_BOOTPARAM_HOTPLUG_CPU is not turned on, CPU0 hotplug feature is not enabled by default. The kernel parameter cpu0_hotplug can enable CPU0 hotplug feature at boot. Currently the feature is supported on Intel platforms only. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-4-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/topology.c | 50 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 76ee97709a0..0e7b4a7a7fb 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -30,23 +30,59 @@ #include #include #include +#include #include static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); #ifdef CONFIG_HOTPLUG_CPU + +#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0 +static int cpu0_hotpluggable = 1; +#else +static int cpu0_hotpluggable; +static int __init enable_cpu0_hotplug(char *str) +{ + cpu0_hotpluggable = 1; + return 1; +} + +__setup("cpu0_hotplug", enable_cpu0_hotplug); +#endif + int __ref arch_register_cpu(int num) { + struct cpuinfo_x86 *c = &cpu_data(num); + + /* + * Currently CPU0 is only hotpluggable on Intel platforms. Other + * vendors can add hotplug support later. + */ + if (c->x86_vendor != X86_VENDOR_INTEL) + cpu0_hotpluggable = 0; + /* - * CPU0 cannot be offlined due to several - * restrictions and assumptions in kernel. This basically - * doesn't add a control file, one cannot attempt to offline - * BSP. + * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate + * depends on BSP. PIC interrupts depend on BSP. * - * Also certain PCI quirks require not to enable hotplug control - * for all CPU's. + * If the BSP depencies are under control, one can tell kernel to + * enable BSP hotplug. This basically adds a control file and + * one can attempt to offline BSP. */ - if (num) + if (num == 0 && cpu0_hotpluggable) { + unsigned int irq; + /* + * We won't take down the boot processor on i386 if some + * interrupts only are able to be serviced by the BSP in PIC. 
+ */ + for_each_active_irq(irq) { + if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) { + cpu0_hotpluggable = 0; + break; + } + } + } + if (num || cpu0_hotpluggable) per_cpu(cpu_devices, num).cpu.hotpluggable = 1; return register_cpu(&per_cpu(cpu_devices, num).cpu, num); -- cgit v1.2.3-70-g09d2 From 30106c174311b8cfaaa3186c7f6f9c36c62d17da Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:41 -0800 Subject: x86, hotplug: Support functions for CPU0 online/offline Add smp_store_boot_cpu_info() to store cpu info for BSP during boot time. Now smp_store_cpu_info() stores cpu info for bringing up BSP or AP after it's offline. Continue to online CPU0 in native_cpu_up(). Continue to offline CPU0 in native_cpu_disable(). Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-5-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 38 ++++++++++++++++++-------------------- 2 files changed, 19 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4f19a152603..b073aaea747 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -166,6 +166,7 @@ void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); +void smp_store_boot_cpu_info(void); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528..c297907f3c7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -125,8 +125,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; /* - * Report back to the Boot Processor. - * Running on AP. + * Report back to the Boot Processor during boot time or to the caller processor + * during CPU online. */ static void __cpuinit smp_callin(void) { @@ -279,19 +279,30 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } +void __init smp_store_boot_cpu_info(void) +{ + int id = 0; /* CPU 0 */ + struct cpuinfo_x86 *c = &cpu_data(id); + + *c = boot_cpu_data; + c->cpu_index = id; +} + /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU */ - void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); *c = boot_cpu_data; c->cpu_index = id; - if (id != 0) - identify_secondary_cpu(c); + /* + * During boot time, CPU0 has this setup already. Save the info when + * bringing up AP or offlined CPU0. 
+ */ + identify_secondary_cpu(c); } static bool __cpuinit @@ -795,7 +806,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); - if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || + if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { pr_err("%s: bad cpu %d\n", __func__, cpu); @@ -990,7 +1001,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) /* * Setup boot CPU information */ - smp_store_cpu_info(0); /* Final full version of the data */ + smp_store_boot_cpu_info(); /* Final full version of the data */ cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); @@ -1214,19 +1225,6 @@ void cpu_disable_common(void) int native_cpu_disable(void) { - int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - clear_local_APIC(); cpu_disable_common(); -- cgit v1.2.3-70-g09d2 From 42e78e9719aa0c76711e2731b19c90fe5ae05278 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:44 -0800 Subject: x86-64, hotplug: Add start_cpu0() entry point to head_64.S start_cpu0() is defined in head_64.S for 64-bit. The function sets up stack and jumps to start_secondary() for CPU0 wake up. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-8-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_64.S | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 94bf9cc2c7e..980053c4b9c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -252,6 +252,22 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movq stack_start(%rip),%rsp + movq initial_code(%rip),%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq +ENDPROC(start_cpu0) +#endif + /* SMP bootup changes these two */ __REFDATA .align 8 -- cgit v1.2.3-70-g09d2 From 3e2a0cc3cdc19e0518ae87583add40ea1bf55b67 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:45 -0800 Subject: x86-32, hotplug: Add start_cpu0() entry point to head_32.S start_cpu0() is defined in head_32.S for 32-bit. The function sets up stack and jumps to start_secondary() for CPU0 wake up. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 957a47aec64..a013e7390ab 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -266,6 +266,19 @@ num_subarch_entries = (. - subarch_entries) / 4 jmp default_entry #endif /* CONFIG_PARAVIRT */ +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). 
Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movl stack_start, %ecx + movl %ecx, %esp + jmp *(initial_code) +ENDPROC(start_cpu0) +#endif + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but -- cgit v1.2.3-70-g09d2 From 1bad2f19f7f79d1ec9e6c48168fd7ce8dc1c305f Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Fri, 26 Oct 2012 13:39:15 +0200 Subject: ACPI / Sleep: add acpi_sleep=nonvs_s3 parameter The ACPI specificiation would like us to save NVS at hibernation time, but makes no mention of saving NVS over S3. Not all versions of Windows do this either, and it is clear that not all machines need NVS saved/restored over S3. Allow the user to improve their suspend/resume time by disabling the NVS save/restore at S3 time, but continue to do the NVS save/restore for S4 as specified. Signed-off-by: Kristen Carlson Accardi Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/sleep.c | 2 ++ drivers/acpi/sleep.c | 17 ++++++++++++++++- include/linux/acpi.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 11676cf65ae..d5e0d717005 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,6 +101,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); + if (strncmp(str, "nonvs_s3", 8) == 0) + acpi_nvs_nosave_s3(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index fdcdbb65291..8640782944c 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -97,6 +97,21 @@ void __init acpi_nvs_nosave(void) nvs_nosave = true; } +/* + * The ACPI specification wants us to save NVS memory regions during hibernation + * but says nothing about saving NVS during S3. Not all versions of Windows + * save NVS on S3 suspend either, and it is clear that not all systems need + * NVS to be saved at S3 time. To improve suspend/resume time, allow the + * user to disable saving NVS on S3 if their system does not require it, but + * continue to save/restore NVS for S4 as specified. + */ +static bool nvs_nosave_s3; + +void __init acpi_nvs_nosave_s3(void) +{ + nvs_nosave_s3 = true; +} + /* * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the * user to request that behavior by using the 'acpi_old_suspend_ordering' @@ -243,7 +258,7 @@ static int acpi_suspend_begin(suspend_state_t pm_state) u32 acpi_state = acpi_suspend_states[pm_state]; int error = 0; - error = nvs_nosave ? 0 : suspend_nvs_alloc(); + error = (nvs_nosave || nvs_nosave_s3) ? 
0 : suspend_nvs_alloc(); if (error) return error; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 90be9898110..3cf93491125 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -261,6 +261,7 @@ int acpi_resources_are_enforced(void); void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); +void __init acpi_nvs_nosave_s3(void); #endif /* CONFIG_PM_SLEEP */ struct acpi_osc_context { -- cgit v1.2.3-70-g09d2 From 35e92b78c1d327b1624e94d1c9c65ea7065d6b95 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 31 Oct 2012 22:44:48 +0100 Subject: ACPI / x86: Export acpi_[un]register_gsi() These functions might be called from modules as well so make sure they are exported. In addition, implement empty version of acpi_unregister_gsi() and remove the one from pci_irq.c. Signed-off-by: Andy Shevchenko Signed-off-by: Mika Westerberg Acked-by: Greg Kroah-Hartman Acked-by: H. Peter Anvin Acked-by: Tony Luck Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/boot.c | 6 ++++++ drivers/acpi/pci_irq.c | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index e651f7a589a..e48cafcf92a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -574,6 +574,12 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) return irq; } +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +void acpi_unregister_gsi(u32 gsi) +{ +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); void __init acpi_set_irq_model_pic(void) { diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 0eefa12e648..1be25a590dc 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -495,11 +495,6 @@ int acpi_pci_irq_enable(struct pci_dev *dev) return 0; } -/* FIXME: implement x86/x86_64 version */ -void __attribute__ ((weak)) acpi_unregister_gsi(u32 i) -{ -} - void acpi_pci_irq_disable(struct pci_dev *dev) { struct acpi_prt_entry *entry; -- cgit v1.2.3-70-g09d2 From e1c467e69040c3be68959332959c07fb3d818e87 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 14 Nov 2012 04:36:53 -0800 Subject: x86, hotplug: Wake up CPU0 via NMI instead of INIT, SIPI, SIPI Instead of waiting for STARTUP after INITs, BSP will execute the BIOS boot-strap code which is not a desired behavior for waking up BSP. To avoid the boot-strap code, wake up CPU0 by NMI instead. This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined (i.e. physically hot removed and then hot added), NMI won't wake it up. We'll change this code in the future to wake up hard offlined CPU0 if real platform and request are available. AP is still waken up as before by INIT, SIPI, SIPI sequence. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352896613-25957-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/cpu.h | 1 + arch/x86/kernel/smpboot.c | 111 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 105 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 4564c8e28a3..a1195726e8c 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -28,6 +28,7 @@ struct x86_cpu { #ifdef CONFIG_HOTPLUG_CPU extern int arch_register_cpu(int num); extern void arch_unregister_cpu(int); +extern void __cpuinit start_cpu0(void); #endif DECLARE_PER_CPU(int, cpu_state); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c297907f3c7..ef53e667e05 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -138,15 +138,17 @@ static void __cpuinit smp_callin(void) * we may get here before an INIT-deassert IPI reaches * our local APIC. We have to wait for the IPI or we'll * lock up on an APIC access. + * + * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. */ - if (apic->wait_for_init_deassert) + cpuid = smp_processor_id(); + if (apic->wait_for_init_deassert && cpuid != 0) apic->wait_for_init_deassert(&init_deasserted); /* * (This works even if the APIC is not enabled.) */ phys_id = read_apic_id(); - cpuid = smp_processor_id(); if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, phys_id, cpuid); @@ -228,6 +230,8 @@ static void __cpuinit smp_callin(void) cpumask_set_cpu(cpuid, cpu_callin_mask); } +static int cpu0_logical_apicid; +static int enable_start_cpu0; /* * Activate a secondary processor. */ @@ -243,6 +247,8 @@ notrace static void __cpuinit start_secondary(void *unused) preempt_disable(); smp_callin(); + enable_start_cpu0 = 0; + #ifdef CONFIG_X86_32 /* switch away from the initial page table */ load_cr3(swapper_pg_dir); @@ -492,7 +498,7 @@ void __inquire_remote_apic(int apicid) * won't ... remember to clear down the APIC, etc later. */ int __cpuinit -wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) +wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { unsigned long send_status, accept_status = 0; int maxlvt; @@ -500,7 +506,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid); + apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -660,6 +666,63 @@ static void __cpuinit announce_cpu(int cpu, int apicid) node, cpu, apicid); } +static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + cpu = smp_processor_id(); + if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) + return NMI_HANDLED; + + return NMI_DONE; +} + +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + * + * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS + * boot-strap code which is not a desired behavior for waking up BSP. To + * void the boot-strap code, wake up CPU0 by NMI instead. + * + * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined + * (i.e. physically hot removed and then hot added), NMI won't wake it up. + * We'll change this code in the future to wake up hard offlined CPU0 if + * real platform and request are available. 
+ */ +static int __cpuinit +wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, + int *cpu0_nmi_registered) +{ + int id; + int boot_error; + + /* + * Wake up AP by INIT, INIT, STARTUP sequence. + */ + if (cpu) + return wakeup_secondary_cpu_via_init(apicid, start_ip); + + /* + * Wake up BSP by nmi. + * + * Register a NMI handler to help wake up CPU0. + */ + boot_error = register_nmi_handler(NMI_LOCAL, + wakeup_cpu0_nmi, 0, "wake_cpu0"); + + if (!boot_error) { + enable_start_cpu0 = 1; + *cpu0_nmi_registered = 1; + if (apic->dest_logical == APIC_DEST_LOGICAL) + id = cpu0_logical_apicid; + else + id = apicid; + boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); + } + + return boot_error; +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -675,6 +738,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long boot_error = 0; int timeout; + int cpu0_nmi_registered = 0; /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); @@ -722,13 +786,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) } /* - * Kick the secondary CPU. Use the method in the APIC driver - * if it's defined - or use an INIT boot APIC message otherwise: + * Wake up a CPU in difference cases: + * - Use the method in the APIC driver if it's defined + * Otherwise, + * - Use an INIT boot APIC message for APs or NMI for BSP. */ if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else - boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, + &cpu0_nmi_registered); if (!boot_error) { /* @@ -793,6 +860,13 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) */ smpboot_restore_warm_reset_vector(); } + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. + */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + return boot_error; } @@ -1037,6 +1111,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) */ setup_local_APIC(); + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + /* * Enable IO APIC before setting up error vector */ @@ -1264,6 +1343,14 @@ void play_dead_common(void) local_irq_disable(); } +static bool wakeup_cpu0(void) +{ + if (smp_processor_id() == 0 && enable_start_cpu0) + return true; + + return false; +} + /* * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. @@ -1327,6 +1414,11 @@ static inline void mwait_play_dead(void) __monitor(mwait_ptr, 0, 0); mb(); __mwait(eax, 0); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); } } @@ -1337,6 +1429,11 @@ static inline void hlt_play_dead(void) while (1) { native_halt(); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); } } -- cgit v1.2.3-70-g09d2 From 27fd185f3d9e83c64b7a596881d7751a638c9683 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:47 -0800 Subject: x86, hotplug: During CPU0 online, enable x2apic, set_numa_node. 
Previously these functions were not run on the BSP (CPU 0, the boot processor) since the boot processor init would only be executed before this functionality was initialized. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-11-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7505f7b13e7..ca165ac6793 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1237,7 +1237,7 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && this_cpu_read(numa_node) == 0 && + if (this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif @@ -1269,8 +1269,7 @@ void __cpuinit cpu_init(void) barrier(); x86_configure_nx(); - if (cpu != 0) - enable_x2apic(); + enable_x2apic(); /* * set up and load the per-CPU TSS -- cgit v1.2.3-70-g09d2 From 30242aa6023b71325c6b8addac06faf544a85fd0 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:48 -0800 Subject: x86, hotplug: The first online processor saves the MTRR state Ask the first online CPU to save mtrr instead of asking BSP. BSP could be offline when mtrr_save_state() is called. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-12-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6b96110bb0c..e4c1a418453 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -695,11 +695,16 @@ void mtrr_ap_init(void) } /** - * Save current fixed-range MTRR state of the BSP + * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); + int first_cpu; + + get_online_cpus(); + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); + put_online_cpus(); } void set_mtrr_aps_delayed_init(void) -- cgit v1.2.3-70-g09d2 From 8d966a04107e56993a051cd41ead0b4f23ba2414 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:49 -0800 Subject: x86, hotplug: Handle retrigger irq by the first available CPU The first cpu in irq cfg->domain is likely to be CPU 0 and may not be available when CPU 0 is offline. Instead of using CPU 0 to handle retriggered irq, we use first available CPU which is online and in this irq's domain. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-13-git-send-email-fenghua.yu@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1817fa91102..f78fc2b4deb 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2199,9 +2199,11 @@ static int ioapic_retrigger_irq(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; unsigned long flags; + int cpu; raw_spin_lock_irqsave(&vector_lock, flags); - apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + cpu = cpumask_first_and(cfg->domain, cpu_online_mask); + apic->send_IPI_mask(cpumask_of(cpu), cfg->vector); raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; -- cgit v1.2.3-70-g09d2 From 6f5298c2139b06925037490367906f3d73955b86 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:50 -0800 Subject: x86/i387.c: Initialize thread xstate only on CPU0 only once init_thread_xstate() is only called once to avoid overriding xstate_size during boot time or during CPU hotplug. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-14-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 675a0501244..245a71db401 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -175,7 +175,11 @@ void __cpuinit fpu_init(void) cr0 |= X86_CR0_EM; write_cr0(cr0); - if (!smp_processor_id()) + /* + * init_thread_xstate is only called once to avoid overriding + * xstate_size during boot time or during CPU hotplug. + */ + if (xstate_size == 0) init_thread_xstate(); mxcsr_feature_mask_init(); -- cgit v1.2.3-70-g09d2 From a71c8bc5dfefbbf80ef90739791554ef7ea4401b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 13 Nov 2012 11:32:51 -0800 Subject: x86, topology: Debug CPU0 hotplug CONFIG_DEBUG_HOTPLUG_CPU0 is for debugging the CPU0 hotplug feature. The switch offlines CPU0 as soon as possible and boots userspace up with CPU0 offlined. User can online CPU0 back after boot time. The default value of the switch is off. To debug CPU0 hotplug, you need to enable CPU0 offline/online feature by either turning on CONFIG_BOOTPARAM_HOTPLUG_CPU0 during compilation or giving cpu0_hotplug kernel parameter at boot. It's safe and early place to take down CPU0 after all hotplug notifiers are installed and SMP is booted. Please note that some applications or drivers, e.g. some versions of udevd, during boot time may put CPU0 online again in this CPU0 hotplug debug mode. In this debug mode, setup_local_APIC() may report a warning on max_loops<=0 when CPU0 is onlined back after boot time. This is because pending interrupt in IRR can not move to ISR. The warning is not CPU0 specfic and it can happen on other CPUs as well. It is harmless except the first CPU0 online takes a bit longer time. And so this debug mode is useful to expose this issue. I'll send a seperate patch to fix this generic warning issue. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1352835171-3958-15-git-send-email-fenghua.yu@intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/Kconfig | 15 ++++++++++++++ arch/x86/include/asm/cpu.h | 3 +++ arch/x86/kernel/topology.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/power/cpu.c | 38 ++++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 036e89ab470..b6cfa5f6252 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1727,6 +1727,21 @@ config BOOTPARAM_HOTPLUG_CPU0 You still can enable the CPU0 hotplug feature at boot by kernel parameter cpu0_hotplug. +config DEBUG_HOTPLUG_CPU0 + def_bool n + prompt "Debug CPU0 hotplug" + depends on HOTPLUG_CPU && EXPERIMENTAL + ---help--- + Enabling this option offlines CPU0 (if CPU0 can be offlined) as + soon as possible and boots up userspace with CPU0 offlined. User + can online CPU0 back after boot time. + + To debug CPU0 hotplug, you need to enable CPU0 offline/online + feature by either turning on CONFIG_BOOTPARAM_HOTPLUG_CPU0 during + compilation or giving cpu0_hotplug kernel parameter at boot. + + If unsure, say N. + config COMPAT_VDSO def_bool y prompt "Compat VDSO support" diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index a1195726e8c..5f9a1243190 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -29,6 +29,9 @@ struct x86_cpu { extern int arch_register_cpu(int num); extern void arch_unregister_cpu(int); extern void __cpuinit start_cpu0(void); +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 +extern int _debug_hotplug_cpu(int cpu, int action); +#endif #endif DECLARE_PER_CPU(int, cpu_state); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0e7b4a7a7fb..6e60b5fe224 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -50,6 +50,57 @@ static int __init enable_cpu0_hotplug(char *str) __setup("cpu0_hotplug", enable_cpu0_hotplug); #endif +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 +/* + * This function offlines a CPU as early as possible and allows userspace to + * boot up without the CPU. The CPU can be onlined back by user after boot. + * + * This is only called for debugging CPU offline/online feature. 
+ */ +int __ref _debug_hotplug_cpu(int cpu, int action) +{ + struct device *dev = get_cpu_device(cpu); + int ret; + + if (!cpu_is_hotpluggable(cpu)) + return -EINVAL; + + cpu_hotplug_driver_lock(); + + switch (action) { + case 0: + ret = cpu_down(cpu); + if (!ret) { + pr_info("CPU %u is now offline\n", cpu); + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); + } else + pr_debug("Can't offline CPU%d.\n", cpu); + break; + case 1: + ret = cpu_up(cpu); + if (!ret) + kobject_uevent(&dev->kobj, KOBJ_ONLINE); + else + pr_debug("Can't online CPU%d.\n", cpu); + break; + default: + ret = -EINVAL; + } + + cpu_hotplug_driver_unlock(); + + return ret; +} + +static int __init debug_hotplug_cpu(void) +{ + _debug_hotplug_cpu(0, 0); + return 0; +} + +late_initcall_sync(debug_hotplug_cpu); +#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ + int __ref arch_register_cpu(int num) { struct cpuinfo_x86 *c = &cpu_data(num); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index adde77588e2..120cee1c3f8 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -21,6 +21,7 @@ #include #include #include /* pcntxt_mask */ +#include #ifdef CONFIG_X86_32 static struct saved_context saved_context; @@ -263,6 +264,43 @@ static int bsp_pm_callback(struct notifier_block *nb, unsigned long action, case PM_HIBERNATION_PREPARE: ret = bsp_check(); break; +#ifdef CONFIG_DEBUG_HOTPLUG_CPU0 + case PM_RESTORE_PREPARE: + /* + * When system resumes from hibernation, online CPU0 because + * 1. it's required for resume and + * 2. the CPU was online before hibernation + */ + if (!cpu_online(0)) + _debug_hotplug_cpu(0, 1); + break; + case PM_POST_RESTORE: + /* + * When a resume really happens, this code won't be called. + * + * This code is called only when user space hibernation software + * prepares for snapshot device during boot time. So we just + * call _debug_hotplug_cpu() to restore to CPU0's state prior to + * preparing the snapshot device. + * + * This works for normal boot case in our CPU0 hotplug debug + * mode, i.e. CPU0 is offline and user mode hibernation + * software initializes during boot time. + * + * If CPU0 is online and user application accesses snapshot + * device after boot time, this will offline CPU0 and user may + * see different CPU0 state before and after accessing + * the snapshot device. But hopefully this is not a case when + * user debugging CPU0 hotplug. Even if users hit this case, + * they can easily online CPU0 back. + * + * To simplify this debug code, we only consider normal boot + * case. Otherwise we need to remember CPU0's state and restore + * to that state and resolve racy conditions etc. + */ + _debug_hotplug_cpu(0, 0); + break; +#endif default: break; } -- cgit v1.2.3-70-g09d2 From 1022623842cb72ee4d0dbf02f6937f38c92c3f41 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Mon, 3 Sep 2012 20:54:48 +0200 Subject: x86-32: Fix invalid stack address while in softirq In 32 bit the stack address provided by kernel_stack_pointer() may point to an invalid range causing NULL pointer access or page faults while in NMI (see trace below). This happens if called in softirq context and if the stack is empty. The address at ®s->sp is then out of range. Fixing this by checking if regs and ®s->sp are in the same stack context. Otherwise return the previous stack pointer stored in struct thread_info. If that address is invalid too, return address of regs. 
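The fix described above boils down to a base-address comparison: on x86-32 the kernel stack is THREAD_SIZE-aligned, so two addresses belong to the same stack exactly when they share the same THREAD_SIZE-aligned base. A minimal illustrative sketch of that test follows (this is not the patch itself; the real implementation is in the ptrace.c hunk further below, and it additionally falls back to thread_info->previous_esp and finally to the address of regs so that a valid pointer is always returned):

/*
 * Sketch only: &regs->sp is trusted as the previous stack pointer
 * only if it lies on the same THREAD_SIZE-aligned stack as regs.
 */
static inline int on_same_stack(unsigned long a, unsigned long b)
{
	return (a & ~(THREAD_SIZE - 1)) == (b & ~(THREAD_SIZE - 1));
}

The trace referenced above follows.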
BUG: unable to handle kernel NULL pointer dereference at 0000000a IP: [] print_context_stack+0x6e/0x8d *pde = 00000000 Oops: 0000 [#1] SMP Modules linked in: Pid: 4434, comm: perl Not tainted 3.6.0-rc3-oprofile-i386-standard-g4411a05 #4 Hewlett-Packard HP xw9400 Workstation/0A1Ch EIP: 0060:[] EFLAGS: 00010093 CPU: 0 EIP is at print_context_stack+0x6e/0x8d EAX: ffffe000 EBX: 0000000a ECX: f4435f94 EDX: 0000000a ESI: f4435f94 EDI: f4435f94 EBP: f5409ec0 ESP: f5409ea0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 0000000a CR3: 34ac9000 CR4: 000007d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 Process perl (pid: 4434, ti=f5408000 task=f5637850 task.ti=f4434000) Stack: 000003e8 ffffe000 00001ffc f4e39b00 00000000 0000000a f4435f94 c155198c f5409ef0 c1003723 c155198c f5409f04 00000000 f5409edc 00000000 00000000 f5409ee8 f4435f94 f5409fc4 00000001 f5409f1c c12dce1c 00000000 c155198c Call Trace: [] dump_trace+0x7b/0xa1 [] x86_backtrace+0x40/0x88 [] ? oprofile_add_sample+0x56/0x84 [] oprofile_add_sample+0x75/0x84 [] op_amd_check_ctrs+0x46/0x260 [] profile_exceptions_notify+0x23/0x4c [] nmi_handle+0x31/0x4a [] ? ftrace_define_fields_irq_handler_entry+0x45/0x45 [] do_nmi+0xa0/0x2ff [] ? ftrace_define_fields_irq_handler_entry+0x45/0x45 [] nmi_stack_correct+0x28/0x2d [] ? ftrace_define_fields_irq_handler_entry+0x45/0x45 [] ? do_softirq+0x4b/0x7f [] irq_exit+0x35/0x5b [] smp_apic_timer_interrupt+0x6c/0x7a [] apic_timer_interrupt+0x2a/0x30 Code: 89 fe eb 08 31 c9 8b 45 0c ff 55 ec 83 c3 04 83 7d 10 00 74 0c 3b 5d 10 73 26 3b 5d e4 73 0c eb 1f 3b 5d f0 76 1a 3b 5d e8 73 15 <8b> 13 89 d0 89 55 e0 e8 ad 42 03 00 85 c0 8b 55 e0 75 a6 eb cc EIP: [] print_context_stack+0x6e/0x8d SS:ESP 0068:f5409ea0 CR2: 000000000000000a ---[ end trace 62afee3481b00012 ]--- Kernel panic - not syncing: Fatal exception in interrupt V2: * add comments to kernel_stack_pointer() * always return a valid stack address by falling back to the address of regs Reported-by: Yang Wei Cc: Signed-off-by: Robert Richter Link: http://lkml.kernel.org/r/20120912135059.GZ8285@erda.amd.com Signed-off-by: H. Peter Anvin Cc: Jun Zhang --- arch/x86/include/asm/ptrace.h | 15 ++++----------- arch/x86/kernel/ptrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index dcfde52979c..19f16ebaf4f 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -205,21 +205,14 @@ static inline bool user_64bit_mode(struct pt_regs *regs) } #endif -/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * This is valid only for kernel mode traps. 
- */ -static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ #ifdef CONFIG_X86_32 - return (unsigned long)(®s->sp); +extern unsigned long kernel_stack_pointer(struct pt_regs *regs); #else +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ return regs->sp; -#endif } +#endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b00b33a1839..2484e331a64 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -166,6 +166,34 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 +/* + * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode + * when it traps. The previous stack will be directly underneath the saved + * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. + * + * Now, if the stack is empty, '®s->sp' is out of range. In this + * case we try to take the previous stack. To always return a non-null + * stack pointer we fall back to regs as stack if no previous stack + * exists. + * + * This is valid only for kernel mode traps. + */ +unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); + unsigned long sp = (unsigned long)®s->sp; + struct thread_info *tinfo; + + if (context == (sp & ~(THREAD_SIZE - 1))) + return sp; + + tinfo = (struct thread_info *)context; + if (tinfo->previous_esp) + return tinfo->previous_esp; + + return (unsigned long)regs; +} + static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); -- cgit v1.2.3-70-g09d2 From cb57a2b4cff7edf2a4e32c0163200e9434807e0a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 20 Nov 2012 22:21:02 -0800 Subject: x86-32: Export kernel_stack_pointer() for modules Modules, in particular oprofile (and possibly other similar tools) need kernel_stack_pointer(), so export it using EXPORT_SYMBOL_GPL(). Cc: Yang Wei Cc: Robert Richter Cc: Jun Zhang Cc: Link: http://lkml.kernel.org/r/20120912135059.GZ8285@erda.amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/ptrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2484e331a64..5e0596b0632 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -193,6 +194,7 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return (unsigned long)regs; } +EXPORT_SYMBOL_GPL(kernel_stack_pointer); static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { -- cgit v1.2.3-70-g09d2 From 36c46ca4f322a7bf89aad5462a3a1f61713edce7 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 15 Nov 2012 13:41:50 -0500 Subject: x86, microcode, AMD: Add support for family 16h processors Add valid patch size for family 16h processors. [ hpa: promoting to urgent/stable since it is hw enabling and trivial ] Signed-off-by: Boris Ostrovsky Acked-by: Andreas Herrmann Link: http://lkml.kernel.org/r/1353004910-2204-1-git-send-email-boris.ostrovsky@amd.com Signed-off-by: H. 
Peter Anvin Cc: --- arch/x86/kernel/microcode_amd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index b3e67ba55b7..efdec7cd8e0 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -190,6 +190,7 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 switch (c->x86) { case 0x14: @@ -198,6 +199,9 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size, case 0x15: max_size = F15H_MPB_MAX_SIZE; break; + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; -- cgit v1.2.3-70-g09d2 From ee4eb87be2c3f69c2c4d9f1c1d98e363a7ad18ab Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 2 Nov 2012 11:18:39 +0000 Subject: x86-64: Fix ordering of CFI directives and recent ASM_CLAC additions While these got added in the right place everywhere else, entry_64.S is the odd one where they ended up before the initial CFI directive(s). In order to cover the full code ranges, the CFI directive must be first, though. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/5093BA1F02000078000A600E@nat28.tlf.novell.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7ee51..1328fe49a3f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -995,8 +995,8 @@ END(interrupt) */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: - ASM_CLAC XCPT_FRAME + ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ /* 0(%rsp): old_rsp-ARGOFFSET */ @@ -1135,8 +1135,8 @@ END(common_interrupt) */ .macro apicinterrupt num sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC pushq_cfi $~(\num) .Lcommon_\sym: interrupt \do_sym @@ -1190,8 +1190,8 @@ apicinterrupt IRQ_WORK_VECTOR \ */ .macro zeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1208,8 +1208,8 @@ END(\sym) .macro paranoidzeroentry sym do_sym ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1227,8 +1227,8 @@ END(\sym) #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) - ASM_CLAC INTR_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp @@ -1247,8 +1247,8 @@ END(\sym) .macro errorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1266,8 +1266,8 @@ END(\sym) /* error code is on the stack already */ .macro paranoiderrorentry sym do_sym ENTRY(\sym) - ASM_CLAC XCPT_FRAME + ASM_CLAC PARAVIRT_ADJUST_EXCEPTION_FRAME subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -- cgit v1.2.3-70-g09d2 From 29c574c0aba8dc0736e19eb9b24aad28cc5c9098 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 26 Nov 2012 14:49:36 -0800 Subject: x86, apic: Cleanup cfg->domain setup for legacy interrupts Issues that need to be handled: * 
Handle PIC interrupts on any CPU irrespective of the apic mode * In the apic lowest priority logical flat delivery mode, be prepared to handle the interrupt on any CPU irrespective of what the IO-APIC RTE says. * Because of above, when the IO-APIC starts handling the legacy PIC interrupt, use the same vector that is being used by the PIC while programming the corresponding IO-APIC RTE. Start with all the cpu's in the legacy PIC interrupts cfg->domain. By the time IO-APIC starts taking over the PIC interrupts, apic driver model is finalized. So depend on the assign_irq_vector() to update the cfg->domain and retain the same vector that was used by PIC before. For the logical apic flat mode, cfg->domain is updated (during the first call to assign_irq_vector()) to contain all the possible online cpu's (0xff). Vector used for the legacy PIC interrupt doesn't change when the IO-APIC starts handling the interrupt. Any interrupt migration after that doesn't change the cfg->domain or the vector used. For other apic modes like physical mode, cfg->domain is updated (during the first call to assign_irq_vector()) to the boot cpu (cpu-0), with the same vector that is being used by the PIC. When that interrupt is migrated to a different cpu, cfg->domin and the vector assigned will change accordingly. Tested-by: Borislav Petkov Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/1353970176.21070.51.camel@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c265593ec2c..0c1f3665056 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -234,11 +234,11 @@ int __init arch_early_irq_init(void) zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* * For legacy IRQ's, start with assigning irq0 to irq15 to - * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. */ if (i < legacy_pic->nr_legacy_irqs) { cfg[i].vector = IRQ0_VECTOR + i; - cpumask_set_cpu(0, cfg[i].domain); + cpumask_setall(cfg[i].domain); } } @@ -1141,7 +1141,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * allocation for the members that are not used anymore. */ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); - cfg->move_in_progress = 1; + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); cpumask_and(cfg->domain, cfg->domain, tmp_mask); break; } @@ -1172,8 +1173,9 @@ next: current_vector = vector; current_offset = offset; if (cfg->vector) { - cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); } for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; @@ -1241,12 +1243,6 @@ void __setup_vector_irq(int cpu) cfg = irq_get_chip_data(irq); if (!cfg) continue; - /* - * If it is a legacy IRQ handled by the legacy PIC, this cpu - * will be part of the irq_cfg's domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) - cpumask_set_cpu(cpu, cfg->domain); if (!cpumask_test_cpu(cpu, cfg->domain)) continue; @@ -1356,16 +1352,6 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0. 
Now that IO-APIC - * can handle this irq and the apic driver is finialized at this point, - * update the cfg->domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && - cpumask_equal(cfg->domain, cpumask_of(0))) - apic->vector_allocation_domain(0, cfg->domain, - apic->target_cpus()); - if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.3-70-g09d2 From 6662c34fa9c60a48aaa5879cb229cd9a84de9c22 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 27 Nov 2012 08:54:36 -0800 Subject: x86-32: Unbreak booting on some 486 clones There appear to have been some 486 clones, including the "enhanced" version of Am486, which have CPUID but not CR4. These 486 clones had only the FPU flag, if any, unlike the Intel 486s with CPUID, which also had VME and therefore needed CR4. Therefore, look at the basic CPUID flags and require at least one bit other than bit 0 before we modify CR4. Thanks to Christian Ludloff of sandpile.org for confirming this as a problem. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 957a47aec64..4dac2f68ed4 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -292,8 +292,8 @@ default_entry: * be using the global pages. * * NOTE! If we are on a 486 we may have no cr4 at all! - * Specifically, cr4 exists if and only if CPUID exists, - * which in turn exists if and only if EFLAGS.ID exists. + * Specifically, cr4 exists if and only if CPUID exists + * and has flags other than the FPU flag set. */ movl $X86_EFLAGS_ID,%ecx pushl %ecx @@ -308,6 +308,11 @@ default_entry: testl %ecx,%eax jz 6f # No ID flag = no CPUID = no CR4 + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz 6f # No flags or only CPUID.FPU = no CR4 + movl pa(mmu_cr4_features),%eax movl %eax,%cr4 -- cgit v1.2.3-70-g09d2 From e5bb8ad862a97a0facc83f3b81731de919fec6ad Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Nov 2012 11:50:26 -0800 Subject: x86, 386 removal: Remove CONFIG_BSWAP All 486+ CPUs support BSWAP, so remove the fallback 386 support code. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1354132230-21854-5-git-send-email-hpa@linux.intel.com --- arch/x86/Kconfig.cpu | 4 ---- arch/x86/include/asm/futex.h | 12 ------------ arch/x86/include/asm/swab.h | 29 ++--------------------------- arch/x86/kernel/cpu/bugs.c | 13 ++----------- 4 files changed, 4 insertions(+), 54 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 52955eeeb1e..8e5867cf07d 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -332,10 +332,6 @@ config X86_INVLPG def_bool y depends on X86_32 -config X86_BSWAP - def_bool y - depends on X86_32 - config X86_POPAD_OK def_bool y depends on X86_32 diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index f373046e63e..be27ba1e947 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -55,12 +55,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) - /* Real i386 machines can only support FUTEX_OP_SET */ - if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3) - return -ENOSYS; -#endif - pagefault_disable(); switch (op) { @@ -118,12 +112,6 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, { int ret = 0; -#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) - /* Real i386 machines have no cmpxchg instruction */ - if (boot_cpu_data.x86 == 3) - return -ENOSYS; -#endif - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; diff --git a/arch/x86/include/asm/swab.h b/arch/x86/include/asm/swab.h index 557cd9f0066..7f235c7105c 100644 --- a/arch/x86/include/asm/swab.h +++ b/arch/x86/include/asm/swab.h @@ -6,22 +6,7 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 val) { -#ifdef __i386__ -# ifdef CONFIG_X86_BSWAP - asm("bswap %0" : "=r" (val) : "0" (val)); -# else - asm("xchgb %b0,%h0\n\t" /* swap lower bytes */ - "rorl $16,%0\n\t" /* swap words */ - "xchgb %b0,%h0" /* swap higher bytes */ - : "=q" (val) - : "0" (val)); -# endif - -#else /* __i386__ */ - asm("bswapl %0" - : "=r" (val) - : "0" (val)); -#endif + asm("bswapl %0" : "=r" (val) : "0" (val)); return val; } #define __arch_swab32 __arch_swab32 @@ -37,22 +22,12 @@ static inline __attribute_const__ __u64 __arch_swab64(__u64 val) __u64 u; } v; v.u = val; -# ifdef CONFIG_X86_BSWAP asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1" : "=r" (v.s.a), "=r" (v.s.b) : "0" (v.s.a), "1" (v.s.b)); -# else - v.s.a = __arch_swab32(v.s.a); - v.s.b = __arch_swab32(v.s.b); - asm("xchgl %0,%1" - : "=r" (v.s.a), "=r" (v.s.b) - : "0" (v.s.a), "1" (v.s.b)); -# endif return v.u; #else /* __i386__ */ - asm("bswapq %0" - : "=r" (val) - : "0" (val)); + asm("bswapq %0" : "=r" (val) : "0" (val)); return val; #endif } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d0e910da16c..0cd07ccdf38 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -136,24 +136,15 @@ static void __init check_popad(void) /* * Check whether we are able to run this kernel safely on SMP. * - * - In order to run on a i386, we need to be compiled for i386 - * (for due to lack of "invlpg" and working WP on a i386) + * - i386 is no longer supported. * - In order to run on anything without a TSC, we need to be * compiled for a i486. */ static void __init check_config(void) { -/* - * We'd better not be a i386 if we're configured to use some - * i486+ only features! 
(WP works in supervisor mode and the - * new "invlpg" and "bswap" instructions) - */ -#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ - defined(CONFIG_X86_BSWAP) - if (boot_cpu_data.x86 == 3) + if (boot_cpu_data.x86 < 4) panic("Kernel requires i486+ for 'invlpg' and other features"); -#endif } -- cgit v1.2.3-70-g09d2 From 094ab1db7cb7833cd4c820acd868fc26acf3f08e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Nov 2012 11:50:27 -0800 Subject: x86, 386 removal: Remove CONFIG_INVLPG All 486+ CPUs support INVLPG, so remove the fallback 386 support code. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1354132230-21854-6-git-send-email-hpa@linux.intel.com --- arch/x86/Kconfig.cpu | 4 ---- arch/x86/include/asm/cpufeature.h | 6 ------ arch/x86/include/asm/tlbflush.h | 3 --- arch/x86/kernel/cpu/amd.c | 3 --- arch/x86/kernel/cpu/intel.c | 4 ---- arch/x86/mm/tlb.c | 8 +++----- 6 files changed, 3 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8e5867cf07d..d3bdc18af1f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -328,10 +328,6 @@ config X86_INVD_BUG config X86_WP_WORKS_OK def_bool y -config X86_INVLPG - def_bool y - depends on X86_32 - config X86_POPAD_OK def_bool y depends on X86_32 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 8c297aa53ee..ff8dd62fda4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -312,12 +312,6 @@ extern const char * const x86_power_flags[32]; #define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) #define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) -#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) -# define cpu_has_invlpg 1 -#else -# define cpu_has_invlpg (boot_cpu_data.x86 > 3) -#endif - #ifdef CONFIG_X86_64 #undef cpu_has_vme diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 74a44333545..0fee48e279c 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -56,10 +56,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - if (cpu_has_invlpg) __flush_tlb_single(addr); - else - __flush_tlb(); } #define TLB_FLUSH_ALL -1UL diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1b7d1656a04..a025d8cc457 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -753,9 +753,6 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) { - if (!cpu_has_invlpg) - return; - tlb_flushall_shift = 5; if (c->x86 <= 0x11) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019a531..fcaabd0432c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -612,10 +612,6 @@ static void __cpuinit intel_tlb_lookup(const unsigned char desc) static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) { - if (!cpu_has_invlpg) { - tlb_flushall_shift = -1; - return; - } switch ((c->x86 << 8) + c->x86_model) { case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 60f926cd8b0..13a6b29e2e5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -104,7 +104,7 @@ static void flush_tlb_func(void *info) return; if 
(this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { - if (f->flush_end == TLB_FLUSH_ALL || !cpu_has_invlpg) + if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); else if (!f->flush_end) __flush_tlb_single(f->flush_start); @@ -337,10 +337,8 @@ static const struct file_operations fops_tlbflush = { static int __cpuinit create_tlb_flushall_shift(void) { - if (cpu_has_invlpg) { - debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, - arch_debugfs_dir, NULL, &fops_tlbflush); - } + debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); return 0; } late_initcall(create_tlb_flushall_shift); -- cgit v1.2.3-70-g09d2 From e3228cf4544355f73437a2b9c6916be9cbafc201 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 28 Nov 2012 11:50:29 -0800 Subject: x86, 386 removal: Remove CONFIG_X86_POPAD_OK The check_popad() routine tested for a 386-specific bug, and never actually did anything useful with it anyway other than print a message. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1354132230-21854-8-git-send-email-hpa@linux.intel.com --- arch/x86/Kconfig.cpu | 4 ---- arch/x86/kernel/cpu/bugs.c | 28 ---------------------------- 2 files changed, 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 159ee9c824c..423db7189eb 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -325,10 +325,6 @@ config X86_INVD_BUG def_bool y depends on M486 -config X86_POPAD_OK - def_bool y - depends on X86_32 - config X86_ALIGNMENT_16 def_bool y depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0cd07ccdf38..92dfec986a4 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -106,33 +106,6 @@ static void __init check_hlt(void) pr_cont("OK\n"); } -/* - * Most 386 processors have a bug where a POPAD can lock the - * machine even from user space. - */ - -static void __init check_popad(void) -{ -#ifndef CONFIG_X86_POPAD_OK - int res, inp = (int) &res; - - pr_info("Checking for popad bug... "); - __asm__ __volatile__( - "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " - : "=&a" (res) - : "d" (inp) - : "ecx", "edi"); - /* - * If this fails, it means that any user program may lock the - * CPU hard. Too bad. - */ - if (res != 12345678) - pr_cont("Buggy\n"); - else - pr_cont("OK\n"); -#endif -} - /* * Check whether we are able to run this kernel safely on SMP. * @@ -157,7 +130,6 @@ void __init check_bugs(void) #endif check_config(); check_hlt(); - check_popad(); init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); -- cgit v1.2.3-70-g09d2 From 91d1aa43d30505b0b825db8898ffc80a8eca96c7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 27 Nov 2012 19:33:25 +0100 Subject: context_tracking: New context tracking susbsystem Create a new subsystem that probes on kernel boundaries to keep track of the transitions between level contexts with two basic initial contexts: user or kernel. This is an abstraction of some RCU code that use such tracking to implement its userspace extended quiescent state. We need to pull this up from RCU into this new level of indirection because this tracking is also going to be used to implement an "on demand" generic virtual cputime accounting. 
A necessary step to shutdown the tick while still accounting the cputime. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Li Zhong Cc: Gilad Ben-Yossef Reviewed-by: Steven Rostedt [ paulmck: fix whitespace error and email address. ] Signed-off-by: Paul E. McKenney --- arch/Kconfig | 15 +++--- arch/x86/Kconfig | 2 +- arch/x86/include/asm/context_tracking.h | 31 ++++++++++++ arch/x86/include/asm/rcu.h | 32 ------------- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/ptrace.c | 8 ++-- arch/x86/kernel/signal.c | 5 +- arch/x86/kernel/traps.c | 2 +- arch/x86/mm/fault.c | 2 +- include/linux/context_tracking.h | 18 +++++++ include/linux/rcupdate.h | 2 - init/Kconfig | 28 +++++------ kernel/Makefile | 1 + kernel/context_tracking.c | 83 +++++++++++++++++++++++++++++++++ kernel/rcutree.c | 64 +------------------------ kernel/sched/core.c | 11 +++-- 16 files changed, 174 insertions(+), 132 deletions(-) create mode 100644 arch/x86/include/asm/context_tracking.h delete mode 100644 arch/x86/include/asm/rcu.h create mode 100644 include/linux/context_tracking.h create mode 100644 kernel/context_tracking.c (limited to 'arch/x86/kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 366ec06a518..cc74aaea116 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -300,15 +300,16 @@ config SECCOMP_FILTER See Documentation/prctl/seccomp_filter.txt for details. -config HAVE_RCU_USER_QS +config HAVE_CONTEXT_TRACKING bool help - Provide kernel entry/exit hooks necessary for userspace - RCU extended quiescent state. Syscalls need to be wrapped inside - rcu_user_exit()-rcu_user_enter() through the slow path using - TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs - are already protected inside rcu_irq_enter/rcu_irq_exit() but - preemption or signal handling on irq exit still need to be protected. + Provide kernel/user boundaries probes necessary for subsystems + that need it, such as userspace RCU extended quiescent state. + Syscalls need to be wrapped inside user_exit()-user_enter() through + the slow path using TIF_NOHZ flag. Exceptions handlers must be + wrapped as well. Irqs are already protected inside + rcu_irq_enter/rcu_irq_exit() but preemption or signal handling on + irq exit still need to be protected. 
config HAVE_VIRT_CPU_ACCOUNTING bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46c3bff3ced..110cfad24f2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -106,7 +106,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select HAVE_RCU_USER_QS if X86_64 + select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_IRQ_TIME_ACCOUNTING select GENERIC_KERNEL_THREAD select GENERIC_KERNEL_EXECVE diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h new file mode 100644 index 00000000000..1616562683e --- /dev/null +++ b/arch/x86/include/asm/context_tracking.h @@ -0,0 +1,31 @@ +#ifndef _ASM_X86_CONTEXT_TRACKING_H +#define _ASM_X86_CONTEXT_TRACKING_H + +#ifndef __ASSEMBLY__ +#include +#include + +static inline void exception_enter(struct pt_regs *regs) +{ + user_exit(); +} + +static inline void exception_exit(struct pt_regs *regs) +{ +#ifdef CONFIG_CONTEXT_TRACKING + if (user_mode(regs)) + user_enter(); +#endif +} + +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_CONTEXT_TRACKING +# define SCHEDULE_USER call schedule_user +#else +# define SCHEDULE_USER call schedule +#endif + +#endif /* !__ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h deleted file mode 100644 index d1ac07a2397..00000000000 --- a/arch/x86/include/asm/rcu.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _ASM_X86_RCU_H -#define _ASM_X86_RCU_H - -#ifndef __ASSEMBLY__ - -#include -#include - -static inline void exception_enter(struct pt_regs *regs) -{ - rcu_user_exit(); -} - -static inline void exception_exit(struct pt_regs *regs) -{ -#ifdef CONFIG_RCU_USER_QS - if (user_mode(regs)) - rcu_user_enter(); -#endif -} - -#else /* __ASSEMBLY__ */ - -#ifdef CONFIG_RCU_USER_QS -# define SCHEDULE_USER call schedule_user -#else -# define SCHEDULE_USER call schedule -#endif - -#endif /* !__ASSEMBLY__ */ - -#endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0c58952d64e..98faeb30139 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index eff5b8c6865..65b88a5dc1a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include @@ -1461,7 +1461,7 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; - rcu_user_exit(); + user_exit(); /* * If we stepped into a sysenter/syscall insn, it trapped in @@ -1516,7 +1516,7 @@ void syscall_trace_leave(struct pt_regs *regs) * or do_notify_resume(), in which case we can be in RCU * user mode. 
*/ - rcu_user_exit(); + user_exit(); audit_syscall_exit(regs); @@ -1534,5 +1534,5 @@ void syscall_trace_leave(struct pt_regs *regs) if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); - rcu_user_enter(); + user_enter(); } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 29ad351804e..20ecac112e7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -816,7 +817,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - rcu_user_exit(); + user_exit(); #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ @@ -840,7 +841,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); - rcu_user_enter(); + user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8276dc6794c..eb8586693e0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8e13ecb41be..7a529cbab7a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,7 +18,7 @@ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ #include /* VSYSCALL_START */ -#include /* exception_enter(), ... */ +#include /* exception_enter(), ... */ /* * Page fault error code bits: diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h new file mode 100644 index 00000000000..e24339ccb7f --- /dev/null +++ b/include/linux/context_tracking.h @@ -0,0 +1,18 @@ +#ifndef _LINUX_CONTEXT_TRACKING_H +#define _LINUX_CONTEXT_TRACKING_H + +#ifdef CONFIG_CONTEXT_TRACKING +#include + +extern void user_enter(void); +extern void user_exit(void); +extern void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next); +#else +static inline void user_enter(void) { } +static inline void user_exit(void) { } +static inline void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) { } +#endif /* !CONFIG_CONTEXT_TRACKING */ + +#endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8fe7c1840d3..275aa3f1062 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -222,8 +222,6 @@ extern void rcu_user_enter(void); extern void rcu_user_exit(void); extern void rcu_user_enter_after_irq(void); extern void rcu_user_exit_after_irq(void); -extern void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } diff --git a/init/Kconfig b/init/Kconfig index 5ac6ee09422..2054e048bb9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -486,9 +486,13 @@ config PREEMPT_RCU This option enables preemptible-RCU code that is common between the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. 
+config CONTEXT_TRACKING + bool + config RCU_USER_QS bool "Consider userspace as in RCU extended quiescent state" - depends on HAVE_RCU_USER_QS && SMP + depends on HAVE_CONTEXT_TRACKING && SMP + select CONTEXT_TRACKING help This option sets hooks on kernel / userspace boundaries and puts RCU in extended quiescent state when the CPU runs in @@ -497,24 +501,20 @@ config RCU_USER_QS try to keep the timer tick on for RCU. Unless you want to hack and help the development of the full - tickless feature, you shouldn't enable this option. It also + dynticks mode, you shouldn't enable this option. It also adds unnecessary overhead. If unsure say N -config RCU_USER_QS_FORCE - bool "Force userspace extended QS by default" - depends on RCU_USER_QS +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING help - Set the hooks in user/kernel boundaries by default in order to - test this feature that treats userspace as an extended quiescent - state until we have a real user like a full adaptive nohz option. - - Unless you want to hack and help the development of the full - tickless feature, you shouldn't enable this option. It adds - unnecessary overhead. - - If unsure say N + Probe on user/kernel boundaries by default in order to + test the features that rely on it such as userspace RCU extended + quiescent states. + This test is there for debugging until we have a real user like the + full dynticks mode. config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" diff --git a/kernel/Makefile b/kernel/Makefile index 0dfeca4324e..f90bbfc9727 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -110,6 +110,7 @@ obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c new file mode 100644 index 00000000000..e0e07fd5550 --- /dev/null +++ b/kernel/context_tracking.c @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include + +struct context_tracking { + /* + * When active is false, hooks are not set to + * minimize overhead: TIF flags are cleared + * and calls to user_enter/exit are ignored. This + * may be further optimized using static keys. + */ + bool active; + enum { + IN_KERNEL = 0, + IN_USER, + } state; +}; + +static DEFINE_PER_CPU(struct context_tracking, context_tracking) = { +#ifdef CONFIG_CONTEXT_TRACKING_FORCE + .active = true, +#endif +}; + +void user_enter(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. 
+ */ + if (in_interrupt()) + return; + + WARN_ON_ONCE(!current->mm); + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.active) && + __this_cpu_read(context_tracking.state) != IN_USER) { + __this_cpu_write(context_tracking.state, IN_USER); + rcu_user_enter(); + } + local_irq_restore(flags); +} + +void user_exit(void) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + if (__this_cpu_read(context_tracking.state) == IN_USER) { + __this_cpu_write(context_tracking.state, IN_KERNEL); + rcu_user_exit(); + } + local_irq_restore(flags); +} + +void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) +{ + if (__this_cpu_read(context_tracking.active)) { + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); + } +} diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7733eb56e15..e441b77b614 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -207,9 +207,6 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), -#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) - .ignore_user_qs = true, -#endif }; static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -420,29 +417,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - unsigned long flags; - struct rcu_dynticks *rdtp; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - - WARN_ON_ONCE(!current->mm); - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->ignore_user_qs && !rdtp->in_user) { - rdtp->in_user = true; - rcu_eqs_enter(true); - } - local_irq_restore(flags); + rcu_eqs_enter(1); } /** @@ -579,27 +554,7 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_user_exit(void) { - unsigned long flags; - struct rcu_dynticks *rdtp; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. 
- */ - if (in_interrupt()) - return; - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - if (rdtp->in_user) { - rdtp->in_user = false; - rcu_eqs_exit(true); - } - local_irq_restore(flags); + rcu_eqs_exit(1); } /** @@ -722,21 +677,6 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); -#ifdef CONFIG_RCU_USER_QS -void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next) -{ - struct rcu_dynticks *rdtp; - - /* Interrupts are disabled in context switch */ - rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->ignore_user_qs) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); - } -} -#endif /* #ifdef CONFIG_RCU_USER_QS */ - #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 36f260864f6..80f80dfca70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include @@ -1886,8 +1887,8 @@ context_switch(struct rq *rq, struct task_struct *prev, spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #endif + context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ - rcu_user_hooks_switch(prev, next); switch_to(prev, next, prev); barrier(); @@ -2911,7 +2912,7 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_CONTEXT_TRACKING asmlinkage void __sched schedule_user(void) { /* @@ -2920,9 +2921,9 @@ asmlinkage void __sched schedule_user(void) * we haven't yet exited the RCU idle mode. Do it here manually until * we find a better solution. */ - rcu_user_exit(); + user_exit(); schedule(); - rcu_user_enter(); + user_enter(); } #endif @@ -3027,7 +3028,7 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); - rcu_user_exit(); + user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); -- cgit v1.2.3-70-g09d2 From 644c154186386bb1fa6446bc5e037b9ed098db46 Mon Sep 17 00:00:00 2001 From: Vincent Palatin Date: Fri, 30 Nov 2012 12:15:32 -0800 Subject: x86, fpu: Avoid FPU lazy restore after suspend When a cpu enters S3 state, the FPU state is lost. After resuming from S3, if we try to lazy restore the FPU for a process running on the same CPU, this will result in a corrupted FPU context. Ensure that "fpu_owner_task" is properly invalidated when (re-)initializing a CPU, so nobody will try to lazy restore a state which doesn't exist in the hardware. Tested with a 64-bit kernel on a 4-core Ivybridge CPU with eagerfpu=off, by doing thousands of suspend/resume cycles with 4 processes running FPU operations. Without the patch, a process is killed after a few hundred cycles by a SIGFPE. Cc: Duncan Laurie Cc: Olof Johansson Cc: v3.4+ # for 3.4 need to replace this_cpu_write by percpu_write Signed-off-by: Vincent Palatin Link: http://lkml.kernel.org/r/1354306532-1014-1-git-send-email-vpalatin@chromium.org Signed-off-by: H.
Peter Anvin --- arch/x86/include/asm/fpu-internal.h | 15 +++++++++------ arch/x86/kernel/smpboot.c | 5 +++++ 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 831dbb9c6c0..41ab26ea656 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -399,14 +399,17 @@ static inline void drop_init_fpu(struct task_struct *tsk) typedef struct { int preload; } fpu_switch_t; /* - * FIXME! We could do a totally lazy restore, but we need to - * add a per-cpu "this was the task that last touched the FPU - * on this CPU" variable, and the task needs to have a "I last - * touched the FPU on this CPU" and check them. + * Must be run with preemption disabled: this clears the fpu_owner_task, + * on this CPU. * - * We don't do that yet, so "fpu_lazy_restore()" always returns - * false, but some day.. + * This will disable any lazy FPU state restore of the current FPU state, + * but if the current thread owns the FPU, it will still be saved by. */ +static inline void __cpu_disable_lazy_restore(unsigned int cpu) +{ + per_cpu(fpu_owner_task, cpu) = NULL; +} + static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) { return new == this_cpu_read_stable(fpu_owner_task) && diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c80a33bc528..f3e2ec878b8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,8 @@ #include #include #include +#include +#include #include #include #include @@ -818,6 +820,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* the FPU context is blank, nobody can own it */ + __cpu_disable_lazy_restore(cpu); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_debug("do_boot_cpu failed %d\n", err); -- cgit v1.2.3-70-g09d2 From f99024729e689f5de4534fde5400e3b035f068de Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:52 -0800 Subject: mm: use vm_unmapped_area() on x86_64 architecture Update the x86_64 arch_get_unmapped_area[_topdown] functions to make use of vm_unmapped_area() instead of implementing a brute force search. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/elf.h | 6 +- arch/x86/kernel/sys_x86_64.c | 151 +++++++++---------------------------------- arch/x86/vdso/vma.c | 2 +- 3 files changed, 33 insertions(+), 126 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5939f44fe0c..9c999c1674f 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -354,12 +354,10 @@ static inline int mmap_is_ia32(void) return 0; } -/* The first two values are special, do not change. See align_addr() */ +/* Do not change the values. 
See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), ALIGN_VA_64 = BIT(1), - ALIGN_VDSO = BIT(2), - ALIGN_TOPDOWN = BIT(3), }; struct va_alignment { @@ -368,5 +366,5 @@ struct va_alignment { } ____cacheline_aligned; extern struct va_alignment va_align; -extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); +extern unsigned long align_vdso_addr(unsigned long); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index b4d3c3927dd..f00d006d60f 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -21,37 +21,23 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. - * - * @flags denotes the allocation direction - bottomup or topdown - - * or vDSO; see call sites below. */ -unsigned long align_addr(unsigned long addr, struct file *filp, - enum align_flags flags) +static unsigned long get_align_mask(void) { - unsigned long tmp_addr; - /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) - return addr; + return 0; if (!(current->flags & PF_RANDOMIZE)) - return addr; - - if (!((flags & ALIGN_VDSO) || filp)) - return addr; - - tmp_addr = addr; - - /* - * We need an address which is <= than the original - * one only when in topdown direction. - */ - if (!(flags & ALIGN_TOPDOWN)) - tmp_addr += va_align.mask; + return 0; - tmp_addr &= ~va_align.mask; + return va_align.mask; +} - return tmp_addr; +unsigned long align_vdso_addr(unsigned long addr) +{ + unsigned long align_mask = get_align_mask(); + return (addr + align_mask) & ~align_mask; } static int __init control_va_addr_alignment(char *str) @@ -126,7 +112,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; unsigned long begin, end; if (flags & MAP_FIXED) @@ -144,50 +130,16 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_ADDR32)) - && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = begin; - } - addr = mm->free_area_cache; - if (addr < begin) - addr = begin; - start_addr = addr; - -full_search: - - addr = align_addr(addr, filp, 0); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (end - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != begin) { - start_addr = addr = begin; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - addr = align_addr(addr, filp, 0); - } + info.flags = 0; + info.length = len; + info.low_limit = begin; + info.high_limit = end; + info.align_mask = filp ? 
get_align_mask() : 0; + info.align_offset = 0; + return vm_unmapped_area(&info); } - unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, @@ -195,7 +147,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0, start_addr; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -217,51 +170,16 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } - /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - -try_again: - /* either no address requested or can't fit in requested address hole */ - start_addr = addr = mm->free_area_cache; - - if (addr < len) - goto fail; - - addr -= len; - do { - addr = align_addr(addr, filp, ALIGN_TOPDOWN); - - /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: - */ - vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) - /* remember the address as a hint for next time */ - return mm->free_area_cache = addr; - - /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len < vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - */ - if (start_addr != mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = 0; - goto try_again; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = filp ? get_align_mask() : 0; + info.align_offset = 0; + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); bottomup: /* @@ -270,14 +188,5 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. */ - mm->cached_hole_size = ~0UL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; - - return addr; + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); } diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 00aaf047b39..431e8754441 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -141,7 +141,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) * unaligned here as a result of stack start randomization. */ addr = PAGE_ALIGN(addr); - addr = align_addr(addr, NULL, ALIGN_VDSO); + addr = align_vdso_addr(addr); return addr; } -- cgit v1.2.3-70-g09d2 From 7d025059650f1c41a427173789ac14b74212b361 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:56 -0800 Subject: mm: fix cache coloring on x86_64 architecture Fix the x86-64 cache alignment code to take pgoff into account. Use the x86 and MIPS cache alignment code as the basis for a generic cache alignment function. The old x86 code will always align the mmap to aliasing boundaries, even if the program mmaps the file with a non-zero pgoff. 
If program A mmaps the file with pgoff 0 and program B mmaps the file with pgoff 1, the old code would align the mmaps, resulting in misaligned pages: A: 0123 B: 123. After this patch, they are aligned so the pages line up: A: 0123 B: 123. Proposed by Rik van Riel. Signed-off-by: Michel Lespinasse Cc: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/sys_x86_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index f00d006d60f..97ef74b88e0 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -136,7 +136,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.low_limit = begin; info.high_limit = end; info.align_mask = filp ? get_align_mask() : 0; - info.align_offset = 0; + info.align_offset = pgoff << PAGE_SHIFT; return vm_unmapped_area(&info); } @@ -175,7 +175,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; info.align_mask = filp ? get_align_mask() : 0; - info.align_offset = 0; + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if (!(addr & ~PAGE_MASK)) return addr; -- cgit v1.2.3-70-g09d2
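For illustration, here is a minimal user-space sketch of the alignment rule the last two patches end up enforcing: vm_unmapped_area() roughly picks an address whose low bits match align_offset within align_mask, so once align_offset carries pgoff << PAGE_SHIFT, two mappings of the same file keep each file page at the same cache color. The mask and start addresses below are assumed example values, not constants taken from the patches.

/*
 * Sketch of the coloring arithmetic only; ALIGN_MASK and the hint
 * addresses are assumed example values, not kernel constants.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define ALIGN_MASK	0x7fffUL	/* assume a 32 KiB aliasing granule */

/* Pick the lowest address >= hint whose low bits equal the file offset. */
static unsigned long pick(unsigned long hint, unsigned long pgoff)
{
	unsigned long off = (pgoff << PAGE_SHIFT) & ALIGN_MASK;

	return ((hint + ALIGN_MASK - off) & ~ALIGN_MASK) + off;
}

int main(void)
{
	/* Mapping A starts at file page 0, mapping B at file page 1. */
	unsigned long a = pick(0x7f0000001000UL, 0);
	unsigned long b = pick(0x7f0000042000UL, 1);

	/* File page 1 gets the same color (0x1000) in both mappings. */
	printf("A file page 1 color: %#lx\n", (a + PAGE_SIZE) & ALIGN_MASK);
	printf("B file page 1 color: %#lx\n", b & ALIGN_MASK);
	return 0;
}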