From 16824255394f55adf31b9a96a9965d8c15bdac4c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Dec 2013 15:08:36 +0100 Subject: x86, acpi, idle: Restructure the mwait idle routines People seem to delight in writing wrong and broken mwait idle routines; collapse the lot. This leaves mwait_play_dead() the sole remaining user of __mwait() and new __mwait() users are probably doing it wrong. Also remove __sti_mwait() as its unused. Cc: Arjan van de Ven Cc: Jacob Jun Pan Cc: Mike Galbraith Cc: Len Brown Cc: Rui Zhang Acked-by: Rafael Wysocki Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131212141654.616820819@infradead.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mwait.h | 40 ++++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/processor.h | 23 ----------------------- 2 files changed, 40 insertions(+), 23 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 2f366d0ac6b..361b02ef128 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_MWAIT_H #define _ASM_X86_MWAIT_H +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 @@ -13,4 +15,42 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __mwait(eax, ecx); + } + __current_clr_polling(); +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7b034a4057f..24821f5768b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -700,29 +700,6 @@ static inline void sync_core(void) #endif } -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax, %ecx, %edx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc8;" - :: "a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax, %ecx;" */ - asm volatile(".byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - -static inline void __sti_mwait(unsigned long eax, unsigned long ecx) -{ - trace_hardirqs_on(); - /* "mwait %eax, %ecx;" */ - asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" - :: "a" (eax), "c" (ecx)); -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void init_amd_e400_c1e_mask(void); -- cgit v1.2.3-70-g09d2 From 7e98b71920464b8d15fa95c74366416cd3c88861 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 19 Dec 2013 11:58:16 -0800 Subject: x86, idle: Use static_cpu_has() for CLFLUSH workaround, add barriers Use static_cpu_has() to conditionalize the CLFLUSH workaround, and add memory barriers around it since the documentation is explicit that CLFLUSH is only ordered with respect to MFENCE. Signed-off-by: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Len Brown Link: http://lkml.kernel.org/r/CA%2B55aFzGxcML7j8CEvQPYzh0W81uVoAAVmGctMOUZ7CZ1yYd2A@mail.gmail.com --- arch/x86/include/asm/mwait.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 361b02ef128..19b71c43925 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -43,8 +43,11 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) { + mb(); clflush((void *)¤t_thread_info()->flags); + mb(); + } __monitor((void *)¤t_thread_info()->flags, 0, 0); if (!need_resched()) -- cgit v1.2.3-70-g09d2 From 5dd12c2152743747ca9f50ef80281e54cc416dc0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 18:04:39 +0100 Subject: sched/clock, x86: Use mul_u64_u32_shr() for native_sched_clock() Use mul_u64_u32_shr() so that x86_64 can use a single 64x64->128 mul. Before: 0000000000000560 : 560: 44 8b 1d 00 00 00 00 mov 0x0(%rip),%r11d # 567 567: 55 push %rbp 568: 48 89 e5 mov %rsp,%rbp 56b: 45 85 db test %r11d,%r11d 56e: 75 4f jne 5bf 570: 0f 31 rdtsc 572: 89 c0 mov %eax,%eax 574: 48 c1 e2 20 shl $0x20,%rdx 578: 48 c7 c1 00 00 00 00 mov $0x0,%rcx 57f: 48 09 c2 or %rax,%rdx 582: 48 c7 c7 00 00 00 00 mov $0x0,%rdi 589: 65 8b 04 25 00 00 00 mov %gs:0x0,%eax 590: 00 591: 48 98 cltq 593: 48 8b 34 c5 00 00 00 mov 0x0(,%rax,8),%rsi 59a: 00 59b: 48 89 d0 mov %rdx,%rax 59e: 81 e2 ff 03 00 00 and $0x3ff,%edx 5a4: 48 c1 e8 0a shr $0xa,%rax 5a8: 48 0f af 14 0e imul (%rsi,%rcx,1),%rdx 5ad: 48 0f af 04 0e imul (%rsi,%rcx,1),%rax 5b2: 5d pop %rbp 5b3: 48 03 04 3e add (%rsi,%rdi,1),%rax 5b7: 48 c1 ea 0a shr $0xa,%rdx 5bb: 48 01 d0 add %rdx,%rax 5be: c3 retq After: 0000000000000550 : 550: 8b 3d 00 00 00 00 mov 0x0(%rip),%edi # 556 556: 55 push %rbp 557: 48 89 e5 mov %rsp,%rbp 55a: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 55e: 85 ff test %edi,%edi 560: 75 2c jne 58e 562: 0f 31 rdtsc 564: 89 c0 mov %eax,%eax 566: 48 c1 e2 20 shl $0x20,%rdx 56a: 48 09 c2 or %rax,%rdx 56d: 65 48 8b 04 25 00 00 mov %gs:0x0,%rax 574: 00 00 576: 89 c0 mov %eax,%eax 578: 48 f7 e2 mul %rdx 57b: 65 48 8b 0c 25 00 00 mov %gs:0x0,%rcx 582: 00 00 584: c9 leaveq 585: 48 0f ac d0 0a shrd $0xa,%rdx,%rax 58a: 48 01 c8 add %rcx,%rax 58d: c3 retq MAINLINE POST sched_clock_stable: 1 1 (cold) sched_clock: 329841 331312 (cold) local_clock: 301773 310296 (warm) sched_clock: 38375 38247 (warm) local_clock: 100371 102713 (warm) rdtsc: 27340 27289 sched_clock_stable: 0 0 (cold) sched_clock: 382634 372706 (cold) local_clock: 396890 399275 (warm) sched_clock: 38194 38124 (warm) local_clock: 143452 148698 (warm) rdtsc: 27345 27365 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-piu203ses5y1g36bnyw2n16x@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 34baa0eb5d0..10a78c03791 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -4,6 +4,7 @@ #include #include #include +#include #define TICK_SIZE (tick_nsec / 1000) @@ -57,10 +58,8 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); static inline unsigned long long __cycles_2_ns(unsigned long long cyc) { - int cpu = smp_processor_id(); - unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), - (1UL << CYC2NS_SCALE_FACTOR)); + unsigned long long ns = this_cpu_read(cyc2ns_offset); + ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); return ns; } -- cgit v1.2.3-70-g09d2 From 57c67da274f3fab38e08d2c9edf08b89e1d9c71d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:39:25 +0100 Subject: sched/clock, x86: Move some cyc2ns() code around There are no __cycles_2_ns() users outside of arch/x86/kernel/tsc.c, so move it there. There are no cycles_2_ns() users. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-01lslnavfgo3kmbo4532zlcj@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 59 ----------------------- arch/x86/kernel/tsc.c | 112 +++++++++++++++++++++++-------------------- 2 files changed, 61 insertions(+), 110 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index 10a78c03791..b4c667693a2 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,66 +13,7 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - * - * In: - * - * ns = cycles * cyc2ns_scale / SC - * - * Although we may still have enough bits to store the value of ns, - * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, - * leading to an incorrect result. - * - * To avoid this, we can decompose 'cycles' into quotient and remainder - * of division by SC. Then, - * - * ns = (quot * SC + rem) * cyc2ns_scale / SC - * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC - * - * - sqazi@google.com - */ - DECLARE_PER_CPU(unsigned long, cyc2ns); DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline unsigned long long __cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns = this_cpu_read(cyc2ns_offset); - ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); - return ns; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long flags; - - local_irq_save(flags); - ns = __cycles_2_ns(cyc); - local_irq_restore(flags); - - return ns; -} - #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d48f56..b4a04ac1d7a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -38,6 +38,66 @@ static int __read_mostly tsc_unstable; static int __read_mostly tsc_disabled = -1; int tsc_clocksource_reliable; + +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +DEFINE_PER_CPU(unsigned long, cyc2ns); +DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + unsigned long long ns = this_cpu_read(cyc2ns_offset); + ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); + return ns; +} + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now, *offset; + unsigned long flags, *scale; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + offset = &per_cpu(cyc2ns_offset, cpu); + + rdtscll(tsc_now); + ns_now = cycles_2_ns(tsc_now); + + if (cpu_khz) { + *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + + cpu_khz / 2) / cpu_khz; + *offset = ns_now - mult_frac(tsc_now, *scale, + (1UL << CYC2NS_SCALE_FACTOR)); + } + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} + /* * Scheduler clock - returns current time in nanosec units. */ @@ -62,7 +122,7 @@ u64 native_sched_clock(void) rdtscll(this_offset); /* return the value in ns */ - return __cycles_2_ns(this_offset); + return cycles_2_ns(this_offset); } /* We need to define a real function for sched_clock, to override the @@ -589,56 +649,6 @@ int recalibrate_cpu_khz(void) EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better precision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); - - rdtscll(tsc_now); - ns_now = __cycles_2_ns(tsc_now); - - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } - - sched_clock_idle_wakeup_event(0); - local_irq_restore(flags); -} - static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) -- cgit v1.2.3-70-g09d2 From 20d1c86a57762f0a33a78988e3fc8818316badd4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Nov 2013 15:40:29 +0100 Subject: sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs Use a ring-buffer like multi-version object structure which allows always having a coherent object; we use this to avoid having to disable IRQs while reading sched_clock() and avoids a problem when getting an NMI while changing the cyc2ns data. MAINLINE PRE POST sched_clock_stable: 1 1 1 (cold) sched_clock: 329841 331312 257223 (cold) local_clock: 301773 310296 309889 (warm) sched_clock: 38375 38247 25280 (warm) local_clock: 100371 102713 85268 (warm) rdtsc: 27340 27289 24247 sched_clock_stable: 0 0 0 (cold) sched_clock: 382634 372706 301224 (cold) local_clock: 396890 399275 399870 (warm) sched_clock: 38194 38124 25630 (warm) local_clock: 143452 148698 129629 (warm) rdtsc: 27345 27365 24307 Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-s567in1e5ekq2nlyhn8f987r@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/timer.h | 23 +++- arch/x86/kernel/cpu/perf_event.c | 14 ++- arch/x86/kernel/tsc.c | 229 +++++++++++++++++++++++++++++++++++---- arch/x86/platform/uv/tlb_uv.c | 66 ++++++----- 4 files changed, 276 insertions(+), 56 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h index b4c667693a2..3de54ef0aea 100644 --- a/arch/x86/include/asm/timer.h +++ b/arch/x86/include/asm/timer.h @@ -13,7 +13,26 @@ extern int recalibrate_cpu_khz(void); extern int no_timer_check; -DECLARE_PER_CPU(unsigned long, cyc2ns); -DECLARE_PER_CPU(unsigned long long, cyc2ns_offset); +/* + * We use the full linear equation: f(x) = a + b*x, in order to allow + * a continuous function in the face of dynamic freq changes. + * + * Continuity means that when our frequency changes our slope (b); we want to + * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t. + * + * Without an offset (a) the above would not be possible. + * + * See the comment near cycles_2_ns() for details on how we compute (b). + */ +struct cyc2ns_data { + u32 cyc2ns_mul; + u32 cyc2ns_shift; + u64 cyc2ns_offset; + u32 __count; + /* u32 hole */ +}; /* 24 bytes -- do not grow */ + +extern struct cyc2ns_data *cyc2ns_read_begin(void); +extern void cyc2ns_read_end(struct cyc2ns_data *); #endif /* _ASM_X86_TIMER_H */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e132931614..9f97bd03f74 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,6 +1883,8 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + struct cyc2ns_data *data; + userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; @@ -1891,13 +1893,17 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!sched_clock_stable) return; + data = cyc2ns_read_begin(); + userpg->cap_user_time = 1; - userpg->time_mult = this_cpu_read(cyc2ns); - userpg->time_shift = CYC2NS_SCALE_FACTOR; - userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; userpg->cap_user_time_zero = 1; - userpg->time_zero = this_cpu_read(cyc2ns_offset); + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); } /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b4a04ac1d7a..92b090b2b79 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -39,7 +39,119 @@ static int __read_mostly tsc_disabled = -1; int tsc_clocksource_reliable; -/* Accelerators for sched_clock() +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ + struct cyc2ns_data *head; /* 48 + 8 = 56 */ + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ + struct cyc2ns_data *head; + + preempt_disable(); + + head = this_cpu_read(cyc2ns.head); + /* + * Ensure we observe the entry when we observe the pointer to it. + * matches the wmb from cyc2ns_write_end(). + */ + smp_read_barrier_depends(); + head->__count++; + barrier(); + + return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ + barrier(); + /* + * If we're the outer most nested read; update the tail pointer + * when we're done. This notifies possible pending writers + * that we've observed the head pointer and that the other + * entry is now free. + */ + if (!--head->__count) { + /* + * x86-TSO does not reorder writes with older reads; + * therefore once this write becomes visible to another + * cpu, we must be finished reading the cyc2ns_data. + * + * matches with cyc2ns_write_begin(). + */ + this_cpu_write(cyc2ns.tail, head); + } + preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + struct cyc2ns_data *data = c2n->data; + + if (data == c2n->head) + data++; + + /* XXX send an IPI to @cpu in order to guarantee a read? */ + + /* + * When we observe the tail write from cyc2ns_read_end(), + * the cpu must be done with that entry and its safe + * to start writing to it. + */ + while (c2n->tail == data) + cpu_relax(); + + return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + /* + * Ensure the @data writes are visible before we publish the + * entry. Matches the data-depencency in cyc2ns_read_begin(). + */ + smp_wmb(); + + ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() * convert from cycles(64bits) => nanoseconds (64bits) * basic equation: * ns = cycles / (freq / ns_per_sec) @@ -61,49 +173,106 @@ int tsc_clocksource_reliable; * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ + data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = 0; + data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + cyc2ns_data_init(&c2n->data[0]); + cyc2ns_data_init(&c2n->data[1]); + + c2n->head = c2n->data; + c2n->tail = c2n->data; +} + static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - unsigned long long ns = this_cpu_read(cyc2ns_offset); - ns += mul_u64_u32_shr(cyc, this_cpu_read(cyc2ns), CYC2NS_SCALE_FACTOR); + struct cyc2ns_data *data, *tail; + unsigned long long ns; + + /* + * See cyc2ns_read_*() for details; replicated in order to avoid + * an extra few instructions that came with the abstraction. + * Notable, it allows us to only do the __count and tail update + * dance when its actually needed. + */ + + preempt_disable(); + data = this_cpu_read(cyc2ns.head); + tail = this_cpu_read(cyc2ns.tail); + + if (likely(data == tail)) { + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + } else { + data->__count++; + + barrier(); + + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + barrier(); + + if (!--data->__count) + this_cpu_write(cyc2ns.tail, data); + } + preempt_enable(); + return ns; } +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - unsigned long long tsc_now, ns_now, *offset; - unsigned long flags, *scale; + unsigned long long tsc_now, ns_now; + struct cyc2ns_data *data; + unsigned long flags; local_irq_save(flags); sched_clock_idle_sleep_event(); - scale = &per_cpu(cyc2ns, cpu); - offset = &per_cpu(cyc2ns_offset, cpu); + if (!cpu_khz) + goto done; + + data = cyc2ns_write_begin(cpu); rdtscll(tsc_now); ns_now = cycles_2_ns(tsc_now); - if (cpu_khz) { - *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + - cpu_khz / 2) / cpu_khz; - *offset = ns_now - mult_frac(tsc_now, *scale, - (1UL << CYC2NS_SCALE_FACTOR)); - } + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + cyc2ns_write_end(cpu, data); +done: sched_clock_idle_wakeup_event(0); local_irq_restore(flags); } - /* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) { - u64 this_offset; + u64 tsc_now; /* * Fall back to jiffies if there's no TSC available: @@ -119,10 +288,10 @@ u64 native_sched_clock(void) } /* read the Time Stamp Counter: */ - rdtscll(this_offset); + rdtscll(tsc_now); /* return the value in ns */ - return cycles_2_ns(this_offset); + return cycles_2_ns(tsc_now); } /* We need to define a real function for sched_clock, to override the @@ -678,11 +847,21 @@ void tsc_restore_sched_clock_state(void) local_irq_save(flags); - __this_cpu_write(cyc2ns_offset, 0); + /* + * We're comming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + offset = cyc2ns_suspend - sched_clock(); - for_each_possible_cpu(cpu) - per_cpu(cyc2ns_offset, cpu) = offset; + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } local_irq_restore(flags); } @@ -1005,8 +1184,10 @@ void __init tsc_init(void) * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) */ - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { + cyc2ns_init(cpu); set_cyc2ns_scale(cpu_khz, cpu); + } if (tsc_disabled > 0) return; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index efe4d722039..dfe605ac1bc 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) return; } -static inline unsigned long cycles_2_us(unsigned long long cyc) +/* + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative + * number, not an absolute. It converts a duration in cycles to a duration in + * ns. + */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc) { + struct cyc2ns_data *data = cyc2ns_read_begin(); unsigned long long ns; - unsigned long us; - int cpu = smp_processor_id(); - ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; + ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + + cyc2ns_read_end(data); + return ns; +} + +/* + * The reverse of the above; converts a duration in ns to a duration in cycles. + */ +static inline unsigned long long ns_2_cycles(unsigned long long ns) +{ + struct cyc2ns_data *data = cyc2ns_read_begin(); + unsigned long long cyc; + + cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + + cyc2ns_read_end(data); + return cyc; +} + +static inline unsigned long cycles_2_us(unsigned long long cyc) +{ + return cycles_2_ns(cyc) / NSEC_PER_USEC; +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + return ns_2_cycles(sec * NSEC_PER_SEC); +} + +static inline unsigned long long usec_2_cycles(unsigned long usec) +{ + return ns_2_cycles(usec * NSEC_PER_USEC); } /* @@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc, bcp, try); } -static inline cycles_t sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Our retries are blocked by all destination sw ack resources being * in use, and a timeout is pending. In that case hardware immediately @@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data) { } -static inline unsigned long long usec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Display the statistics thru /proc/sgi_uv/ptc_statistics * 'data' points to the cpu number -- cgit v1.2.3-70-g09d2 From 8cb75e0c4ec9786b81439761eac1d18d4a931af3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Nov 2013 12:22:37 +0100 Subject: sched/preempt: Fix up missed PREEMPT_NEED_RESCHED folding With various drivers wanting to inject idle time; we get people calling idle routines outside of the idle loop proper. Therefore we need to be extra careful about not missing TIF_NEED_RESCHED -> PREEMPT_NEED_RESCHED propagations. While looking at this, I also realized there's a small window in the existing idle loop where we can miss TIF_NEED_RESCHED; when it hits right after the tif_need_resched() test at the end of the loop but right before the need_resched() test at the start of the loop. So move preempt_fold_need_resched() out of the loop where we're guaranteed to have TIF_NEED_RESCHED set. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-x9jgh45oeayzajz2mjt0y7d6@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mwait.h | 2 +- include/linux/preempt.h | 15 +++++++++++++++ include/linux/sched.h | 15 +++++++++++++++ kernel/cpu/idle.c | 17 ++++++++++------- kernel/sched/core.c | 3 +-- 5 files changed, 42 insertions(+), 10 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 19b71c43925..1da25a5f96f 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -53,7 +53,7 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) if (!need_resched()) __mwait(eax, ecx); } - __current_clr_polling(); + current_clr_polling(); } #endif /* _ASM_X86_MWAIT_H */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index dd9ddf8af20..59749fc4832 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -134,6 +134,21 @@ do { \ #undef preempt_check_resched #endif +#ifdef CONFIG_PREEMPT +#define preempt_set_need_resched() \ +do { \ + set_preempt_need_resched(); \ +} while (0) +#define preempt_fold_need_resched() \ +do { \ + if (tif_need_resched()) \ + set_preempt_need_resched(); \ +} while (0) +#else +#define preempt_set_need_resched() do { } while (0) +#define preempt_fold_need_resched() do { } while (0) +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; diff --git a/include/linux/sched.h b/include/linux/sched.h index a0387522166..ffccdad050b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2745,6 +2745,21 @@ static inline bool __must_check current_clr_polling_and_test(void) } #endif +static inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb(); /* paired with resched_task() */ + + preempt_fold_need_resched(); +} + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 988573a9a38..277f494c2a9 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -105,14 +105,17 @@ static void cpu_idle_loop(void) __current_set_polling(); } arch_cpu_idle_exit(); - /* - * We need to test and propagate the TIF_NEED_RESCHED - * bit here because we might not have send the - * reschedule IPI to idle tasks. - */ - if (tif_need_resched()) - set_preempt_need_resched(); } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 392c6f87906..0326c06953e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1510,8 +1510,7 @@ void scheduler_ipi(void) * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ - if (tif_need_resched()) - set_preempt_need_resched(); + preempt_fold_need_resched(); if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) -- cgit v1.2.3-70-g09d2