From 40747ffa5aa8d5b99ca46c696234b9194b59e0ac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:51:59 +0100 Subject: asmlinkage: Make jiffies visible Jiffies is referenced by the linker script, so it has to be visible. Handled both the generic and the x86 version. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-3-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e..d78de047599 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -52,7 +52,7 @@ #define CREATE_TRACE_POINTS #include -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); -- cgit v1.2.3-70-g09d2 From 8ba14654282ed6bb386d0a2f1ab329bfb293403f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Feb 2014 17:09:54 +0100 Subject: timer: Spare IPI when deferrable timer is queued on idle remote targets When a timer is enqueued or modified on a remote target, the latter is expected to see and handle this timer on its next tick. However if the target is idle and CONFIG_NO_HZ_IDLE=y, the CPU may be sleeping tickless and the timer may be ignored. wake_up_nohz_cpu() takes care of that by setting TIF_NEED_RESCHED and sending an IPI to idle targets so that the tick is reevaluated on the idle loop through the tick_nohz_idle_*() APIs. Now this is all performed regardless of the power properties of the timer. If the timer is deferrable, idle targets don't need to be woken up. Only the next busy tick needs to care about it, and no IPI kick is needed for that to happen. So let's spare the IPI on idle targets when the timer is deferrable. Meanwhile we keep the current behaviour on full dynticks targets. 
We can spare IPIs on idle full dynticks targets as well but some tricky races against idle_cpu() must be dealt with all along to make sure that the timer is well handled after idle exit. We can deal with that later since NO_HZ_FULL already has more important powersaving issues. Reported-by: Thomas Gleixner Signed-off-by: Viresh Kumar Cc: Ingo Molnar Cc: Paul Gortmaker Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/CAKohpomMZ0TAN2e6N76_g4ZRzxd5vZ1XfuZfxrP7GMxfTNiLVw@mail.gmail.com Signed-off-by: Frederic Weisbecker --- kernel/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e..b75e7893be1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -939,8 +939,15 @@ void add_timer_on(struct timer_list *timer, int cpu) * with the timer by holding the timer base lock. This also * makes sure that a CPU on the way to stop its tick can not * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. */ - wake_up_nohz_cpu(cpu); + if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) + wake_up_nohz_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); -- cgit v1.2.3-70-g09d2 From fff421580f512fc044cc7421fdff31a7a6997350 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Jan 2014 20:20:43 -0800 Subject: timers: Track total number of timers in list Currently, the tvec_base structure's ->active_timers field tracks only the non-deferrable timers, which means that even if ->active_timers is zero, there might well be deferrable timers in the list. This commit therefore adds an ->all_timers field to track all the timers, whether deferrable or not. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e..fdc43834f3a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -81,6 +81,7 @@ struct tvec_base { unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; + unsigned long all_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -392,6 +393,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) base->next_timer = timer->expires; base->active_timers++; } + base->all_timers++; } #ifdef CONFIG_TIMER_STATS @@ -671,6 +673,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) base->active_timers--; + base->all_timers--; } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -685,6 +688,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } + base->all_timers--; return 1; } @@ -1559,6 +1563,7 @@ static int init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; base->active_timers = 0; + base->all_timers = 0; return 0; } -- cgit v1.2.3-70-g09d2 From d550e81dc0ddc04f1b417c179c214103a28e0ee8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 05:57:10 -0800 Subject: timers: Reduce __run_timers() latency for empty list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. 
In this case, which is likely to be common for NO_HZ_FULL kernels, the kernel currently incurs a large latency for no good reason. This commit therefore short-circuits this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index fdc43834f3a..c8bc7091d8f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -338,6 +338,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +/* + * If the list is empty, catch up ->timer_jiffies to the current time. + * The caller must hold the tvec_base lock. Returns true if the list + * was empty and therefore ->timer_jiffies was updated. + */ +static bool catchup_timer_jiffies(struct tvec_base *base) +{ + if (!base->all_timers) { + base->timer_jiffies = jiffies; + return true; + } + return false; +} + static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { @@ -1150,6 +1164,10 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); + if (catchup_timer_jiffies(base)) { + spin_unlock_irq(&base->lock); + return; + } while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; -- cgit v1.2.3-70-g09d2 From 16d937f880312e3f47157d4d6d6ebf7e61523378 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 10:32:01 -0800 Subject: timers: Reduce future __run_timers() latency for newly emptied list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. 
Therefore, if we just emptied the timer wheel, for example, by deleting the last timer, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. This commit therefore catches ->timer_jiffies for this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index c8bc7091d8f..dfac34f7186 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -688,6 +688,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) if (!tbase_get_deferrable(timer->base)) base->active_timers--; base->all_timers--; + (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -703,6 +704,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, base->next_timer = base->timer_jiffies; } base->all_timers--; + (void)catchup_timer_jiffies(base); return 1; } -- cgit v1.2.3-70-g09d2 From 18d8cb64c9c074cbe2bd677ab10fff8283abdb62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 10:41:50 -0800 Subject: timers: Reduce future __run_timers() latency for first add to empty list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. Therefore, just before we add a timer to an empty timer wheel, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. 
This commit therefore updates ->timer_jiffies for this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index dfac34f7186..0c638cf3d9d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -398,6 +398,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { + (void)catchup_timer_jiffies(base); __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer -- cgit v1.2.3-70-g09d2 From aea369b959bef10d235cd0714789cd8b0fe170b8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Jan 2014 16:19:27 -0800 Subject: timers: Make internal_add_timer() update ->next_timer if ->active_timers == 0 The internal_add_timer() function updates base->next_timer only if timer->expires < base->next_timer. This is correct, but it also makes sense to do the same if we add the first non-deferrable timer. Signed-off-by: Oleg Nesterov Reviewed-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Tested-by: Mike Galbraith --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index 0c638cf3d9d..c0d8898fed9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,9 +404,9 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * Update base->active_timers and base->next_timer */ if (!tbase_get_deferrable(timer->base)) { - if (time_before(timer->expires, base->next_timer)) + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; - base->active_timers++; } base->all_timers++; } -- cgit v1.2.3-70-g09d2 From c24a4a369419c360c323865b91198878275c1481 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Feb 2014 14:15:21 +0530 Subject: timer: Check failure of timer_cpu_notify() before calling init_timer_stats() timer_cpu_notify() should return NOTIFY_OK and nothing else. Anything else would trigger a BUG_ON(). Return value of this routine is already checked correctly but is done after issuing a call to init_timer_stats(). The right order would be to check the error case first and then call init_timer_stats(). Lets do it. 
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: tj@kernel.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/c439f5b6bbc2047e1662f4d523350531425bcf9d.1393576981.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index a71bdfdb51e..31824ef3eb9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1681,9 +1681,9 @@ void __init init_timers(void) err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); - init_timer_stats(); - BUG_ON(err != NOTIFY_OK); + + init_timer_stats(); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.2.3-70-g09d2 From 38edbb0b913d73713c23dcc742669f7e78b52aa7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Feb 2014 14:15:22 +0530 Subject: timer: Make sure TIMER_FLAG_MASK bits are free in allocated base Currently we are using the two lowest bits of base for internal purposes and so they both should be zero in the allocated address. The code was doing the right thing before this patch came in: commit c5f66e99b (timer: Implement TIMER_IRQSAFE) Tejun probably forgot to update this piece of code which checks if the lowest 'n' bits are zero or not and so wasn't updated according to the new flag. Let's use TIMER_FLAG_MASK in the calculations here. 
[ tglx: Massaged changelog ] Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: tj@kernel.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/9144e10d7e854a0aa8a673332adec356d81a923c.1393576981.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index 31824ef3eb9..949d74ea0ce 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1555,9 +1555,8 @@ static int init_timers_cpu(int cpu) if (!base) return -ENOMEM; - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); + /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ + if (WARN_ON(base != tbase_get_base(base))) { kfree(base); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From c41eba7de133e43ea2c998ccd484059feab200f6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 18 Mar 2014 13:23:15 +0530 Subject: timer: Use variable head instead of &work_list in __run_timers() We already have a variable 'head' that points to '&work_list', and so we should use that instead wherever possible. 
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/0d8645a6efc8360c4196c9797d59343abbfdcc5e.1395129136.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/timer.c') diff --git a/kernel/timer.c b/kernel/timer.c index 949d74ea0ce..8e503fec1fb 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1192,7 +1192,7 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); + list_replace_init(base->tv1.vec + index, head); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; -- cgit v1.2.3-70-g09d2 From 6201b4d61fbf194df6371fb3376c5026cb8f5eec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 18 Mar 2014 16:26:07 +0530 Subject: timer: Remove code redundancy while calling get_nohz_timer_target() There are only two users of get_nohz_timer_target(): timer and hrtimer. Both call it under same circumstances, i.e. #ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif So, it makes more sense to get all this as part of get_nohz_timer_target() instead of duplicating code at two places. For this another parameter is required to be passed to this routine, pinned. 
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1e1b53537217d58d48c2d7a222a9c3ac47d5b64c.1395140107.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 6 +++++- kernel/hrtimer.c | 15 +-------------- kernel/sched/core.c | 5 ++++- kernel/timer.c | 7 +------ 4 files changed, 11 insertions(+), 22 deletions(-) (limited to 'kernel/timer.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 68a0e84463a..6f6c56f63c6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -291,10 +291,14 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(void); +extern int get_nohz_timer_target(int pinned); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } +static inline int get_nohz_timer_target(int pinned) +{ + return smp_processor_id(); +} #endif /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 09094361dce..d55092ceee2 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -168,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ_COMMON - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - /* * With HIGHRES=y we do not migrate the timer when it is expiring * before the next event on the target cpu because we cannot reprogram @@ -214,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); - int cpu = 
hrtimer_get_target(this_cpu, pinned); + int cpu = get_nohz_timer_target(pinned); int basenum = base->index; again: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aa..c0339e206cc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -555,12 +555,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { diff --git a/kernel/timer.c b/kernel/timer.c index 8e503fec1fb..1d35ddadc04 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -760,12 +760,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) - cpu = get_nohz_timer_target(); -#endif + cpu = get_nohz_timer_target(pinned); new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { -- cgit v1.2.3-70-g09d2