From a35b6466aabb051568b844e8c63f87a356d3d129 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Aug 2012 21:46:40 +0200 Subject: sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies Peter Portante reported that for large cgroup hierarchies (and or on large CPU counts) we get immense lock contention on rq->lock and stuff stops working properly. His workload was a ton of processes, each in their own cgroup, everybody idling except for a sporadic wakeup once every so often. It was found that: schedule() idle_balance() load_balance() local_irq_save() double_rq_lock() update_h_load() walk_tg_tree(tg_load_down) tg_load_down() Results in an entire cgroup hierarchy walk under rq->lock for every new-idle balance and since new-idle balance isn't throttled this results in a lot of work while holding the rq->lock. This patch does two things, it removes the work from under rq->lock based on the good principle of race and pray which is widely employed in the load-balancer as a whole. And secondly it throttles the update_h_load() calculation to max once per jiffy. I considered excluding update_h_load() for new-idle balance all-together, but purely relying on regular balance passes to update this data might not work out under some rare circumstances where the new-idle busiest isn't the regular busiest for a while (unlikely, but a nightmare to debug if someone hits it and suffers). Cc: pjt@google.com Cc: Larry Woodman Cc: Mike Galbraith Reported-by: Peter Portante Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-aaarrzfpnaam7pqrekofu8a6@git.kernel.org Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 11 +++++++++-- kernel/sched/sched.h | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d0cc03b3e70..c219bf8d704 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3387,6 +3387,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + struct rq *rq = cpu_rq(cpu); + unsigned long now = jiffies; + + if (rq->h_load_throttle == now) + return; + + rq->h_load_throttle = now; + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); rcu_read_unlock(); @@ -4293,11 +4301,10 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); + update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - if (!env.loop) - update_h_load(env.src_cpu); /* * cur_ld_moved - load moved in current iteration diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c35a1a7dd4d..531411b1044 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -374,7 +374,11 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#endif +#ifdef CONFIG_SMP + unsigned long h_load_throttle; +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED struct list_head leaf_rt_rq_list; #endif -- cgit v1.2.3-70-g09d2 From bea6832cc8c4a0a9a65dd17da6aaa657fe27bc3e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 8 Aug 2012 11:27:15 +0200 Subject: sched: fix divide by zero at {thread_group,task}_times On architectures where cputime_t is 64 bit type, is possible to trigger divide by zero on do_div(temp, (__force u32) total) line, if total is a non zero number but has lower 32 bit's zeroed. Removing casting is not a good solution since some do_div() implementations do cast to u32 internally. This problem can be triggered in practice on very long lived processes: PID: 2331 TASK: ffff880472814b00 CPU: 2 COMMAND: "oraagent.bin" #0 [ffff880472a51b70] machine_kexec at ffffffff8103214b #1 [ffff880472a51bd0] crash_kexec at ffffffff810b91c2 #2 [ffff880472a51ca0] oops_end at ffffffff814f0b00 #3 [ffff880472a51cd0] die at ffffffff8100f26b #4 [ffff880472a51d00] do_trap at ffffffff814f03f4 #5 [ffff880472a51d60] do_divide_error at ffffffff8100cfff #6 [ffff880472a51e00] divide_error at ffffffff8100be7b [exception RIP: thread_group_times+0x56] RIP: ffffffff81056a16 RSP: ffff880472a51eb8 RFLAGS: 00010046 RAX: bc3572c9fe12d194 RBX: ffff880874150800 RCX: 0000000110266fad RDX: 0000000000000000 RSI: ffff880472a51eb8 RDI: 001038ae7d9633dc RBP: ffff880472a51ef8 R8: 00000000b10a3a64 R9: ffff880874150800 R10: 00007fcba27ab680 R11: 0000000000000202 R12: ffff880472a51f08 R13: ffff880472a51f10 R14: 0000000000000000 R15: 0000000000000007 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff880472a51f00] do_sys_times at ffffffff8108845d #8 [ffff880472a51f40] sys_times at ffffffff81088524 #9 [ffff880472a51f80] system_call_fastpath at ffffffff8100b0f2 RIP: 0000003808caac3a RSP: 00007fcba27ab6d8 RFLAGS: 00000202 RAX: 0000000000000064 RBX: ffffffff8100b0f2 RCX: 0000000000000000 RDX: 00007fcba27ab6e0 RSI: 000000000076d58e RDI: 00007fcba27ab6e0 RBP: 00007fcba27ab700 R8: 0000000000000020 R9: 000000000000091b R10: 00007fcba27ab680 R11: 0000000000000202 R12: 00007fff9ca41940 R13: 0000000000000000 R14: 00007fcba27ac9c0 R15: 00007fff9ca41940 ORIG_RAX: 0000000000000064 CS: 0033 SS: 002b Cc: stable@vger.kernel.org Signed-off-by: Stanislaw Gruszka Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120808092714.GA3580@redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2cb4e777799..e2c5841c7df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3142,6 +3142,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif +static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) +{ + u64 temp = (__force u64) rtime; + + temp *= (__force u64) utime; + + if (sizeof(cputime_t) == 4) + temp = div_u64(temp, (__force u32) total); + else + temp = div64_u64(temp, (__force u64) total); + + return (__force cputime_t) temp; +} + void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -3151,13 +3165,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) */ rtime = nsecs_to_cputime(p->se.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(utime, rtime, total); + else utime = rtime; /* @@ -3184,13 +3194,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) total = cputime.utime + cputime.stime; rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - if (total) { - u64 temp = (__force u64) rtime; - - temp *= (__force u64) cputime.utime; - do_div(temp, (__force u32) total); - utime = (__force cputime_t) temp; - } else + if (total) + utime = scale_utime(cputime.utime, rtime, total); + else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); -- cgit v1.2.3-70-g09d2 From 35cf4e50b16331def6cfcbee11e49270b6db07f5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 7 Aug 2012 05:00:13 +0200 Subject: sched,cgroup: Fix up task_groups list With multiple instances of task_groups, for_each_rt_rq() is a noop, no task groups having been added to the rt.c list instance. This renders __enable/disable_runtime() and print_rt_stats() noop, the user (non) visible effect being that rt task groups are missing in /proc/sched_debug. Signed-off-by: Mike Galbraith Cc: stable@kernel.org # v3.3+ Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344308413.6846.7.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e2c5841c7df..6aa212f3c58 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7252,6 +7252,7 @@ int in_sched_functions(unsigned long addr) #ifdef CONFIG_CGROUP_SCHED struct task_group root_task_group; +LIST_HEAD(task_groups); #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 531411b1044..f6714d009e7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex; struct cfs_rq; struct rt_rq; -static LIST_HEAD(task_groups); +extern struct list_head task_groups; struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH -- cgit v1.2.3-70-g09d2 From e221d028bb08b47e624c5f0a31732c642db9d19a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 7 Aug 2012 10:02:38 +0200 Subject: sched,rt: fix isolated CPUs leaving root_task_group indefinitely throttled Root task group bandwidth replenishment must service all CPUs, regardless of where the timer was last started, and regardless of the isolation mechanism, lest 'Quoth the Raven, "Nevermore"' become rt scheduling policy. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344326558.6968.25.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/sched/rt.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 573e1ca0110..944cb68420e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -788,6 +788,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); +#ifdef CONFIG_RT_GROUP_SCHED + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest + * the timer run on a CPU which does not service all runqueues, + * potentially leaving other CPUs indefinitely throttled. If + * isolation is really required, the user will turn the throttle + * off to kill the perturbations it causes anyway. Meanwhile, + * this maintains functionality for boot and/or troubleshooting. + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +#endif for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); -- cgit v1.2.3-70-g09d2 From 8f6189684eb4e85e6c593cd710693f09c944450a Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 4 Aug 2012 05:44:14 +0200 Subject: sched: Fix migration thread runtime bogosity Make stop scheduler class do the same accounting as other classes, Migration threads can be caught in the act while doing exec balancing, leading to the below due to use of unmaintained ->se.exec_start. The load that triggered this particular instance was an apparently out of control heavily threaded application that does system monitoring in what equated to an exec bomb, with one of the VERY frequently migrated tasks being ps. %CPU PID USER CMD 99.3 45 root [migration/10] 97.7 53 root [migration/12] 97.0 57 root [migration/13] 90.1 49 root [migration/11] 89.6 65 root [migration/15] 88.7 17 root [migration/3] 80.4 37 root [migration/8] 78.1 41 root [migration/9] 44.2 13 root [migration/2] Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1344051854.6739.19.camel@marge.simpson.net Signed-off-by: Thomas Gleixner --- kernel/sched/stop_task.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 7b386e86fd2..da5eb5bed84 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) + if (stop && stop->on_rq) { + stop->se.exec_start = rq->clock_task; return stop; + } return NULL; } @@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { + struct task_struct *curr = rq->curr; + u64 delta_exec; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); } static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) @@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) static void set_curr_task_stop(struct rq *rq) { + struct task_struct *stop = rq->stop; + + stop->se.exec_start = rq->clock_task; } static void switched_to_stop(struct rq *rq, struct task_struct *p) -- cgit v1.2.3-70-g09d2 From 0fe33aae0e94b4097dd433c9399e16e17d638cd8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: don't free_chunk() after fsnotify_add_mark() Don't do free_chunk() after fsnotify_add_mark(). That one does a delayed unref via the destroy list and this results in use-after-free. Signed-off-by: Miklos Szeredi Acked-by: Eric Paris CC: stable@vger.kernel.org --- kernel/audit_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3a5ca582ba1..69a58515f43 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -259,7 +259,7 @@ static void untag_chunk(struct node *p) fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { - free_chunk(new); + fsnotify_put_mark(&new->mark); goto Fallback; } @@ -322,7 +322,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) entry = &chunk->mark; if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { - free_chunk(chunk); + fsnotify_put_mark(entry); return -ENOSPC; } @@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_duplicate_mark(chunk_entry, old_entry); if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { spin_unlock(&old_entry->lock); - free_chunk(chunk); + fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; } -- cgit v1.2.3-70-g09d2 From a2140fc0cb0325bb6384e788edd27b9a568714e2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: fix refcounting in audit-tree Refcounting of fsnotify_mark in audit tree is broken. E.g: refcount create_chunk alloc_chunk 1 fsnotify_add_mark 2 untag_chunk fsnotify_get_mark 3 fsnotify_destroy_mark audit_tree_freeing_mark 2 fsnotify_put_mark 1 fsnotify_put_mark 0 via destroy_list fsnotify_mark_destroy -1 This was reported by various people as triggering Oops when stopping auditd. We could just remove the put_mark from audit_tree_freeing_mark() but that would break freeing via inode destruction. So this patch simply omits a put_mark after calling destroy_mark or adds a get_mark before. The additional get_mark is necessary where there's no other put_mark after fsnotify_destroy_mark() since it assumes that the caller is holding a reference (or the inode is keeping the mark pinned, not the case here AFAICS). Signed-off-by: Miklos Szeredi Reported-by: Valentin Avram Reported-by: Peter Moody Acked-by: Eric Paris CC: stable@vger.kernel.org --- kernel/audit_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 69a58515f43..2b2ffff9eca 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -250,7 +250,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; } @@ -293,7 +292,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; Fallback: @@ -332,6 +330,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); chunk->dead = 1; spin_unlock(&entry->lock); + fsnotify_get_mark(entry); fsnotify_destroy_mark(entry); fsnotify_put_mark(entry); return 0; @@ -412,6 +411,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); + fsnotify_get_mark(chunk_entry); fsnotify_destroy_mark(chunk_entry); fsnotify_put_mark(chunk_entry); @@ -445,7 +445,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ - fsnotify_put_mark(old_entry); /* and kill it */ return 0; } -- cgit v1.2.3-70-g09d2 From b3e8692b4dde5cf5fc60e4b95385229a72623182 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: clean up refcounting in audit-tree Drop the initial reference by fsnotify_init_mark early instead of audit_tree_freeing_mark() at destroy time. In the cases we destroy the mark before we drop the initial reference we need to get rid of the get_mark that balances the put_mark in audit_tree_freeing_mark(). Signed-off-by: Miklos Szeredi --- kernel/audit_tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2b2ffff9eca..ed206fd88cc 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -292,6 +292,7 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); + fsnotify_put_mark(&new->mark); /* drop initial reference */ goto out; Fallback: @@ -330,7 +331,6 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); chunk->dead = 1; spin_unlock(&entry->lock); - fsnotify_get_mark(entry); fsnotify_destroy_mark(entry); fsnotify_put_mark(entry); return 0; @@ -346,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) insert_hash(chunk); spin_unlock(&hash_lock); spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* drop initial reference */ return 0; } @@ -411,7 +412,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); - fsnotify_get_mark(chunk_entry); fsnotify_destroy_mark(chunk_entry); fsnotify_put_mark(chunk_entry); @@ -444,6 +444,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(chunk_entry); /* drop initial reference */ fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ return 0; } @@ -915,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); evict_chunk(chunk); - fsnotify_put_mark(entry); + + /* + * We are guaranteed to have at least one reference to the mark from + * either the inode or the caller of fsnotify_destroy_mark(). + */ + BUG_ON(atomic_read(&entry->refcnt) < 1); } static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, -- cgit v1.2.3-70-g09d2 From 4e8b14526ca7fb046a81c94002c1c43b6fdf0e9b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 8 Aug 2012 15:36:20 -0400 Subject: time: Improve sanity checking of timekeeping inputs Unexpected behavior could occur if the time is set to a value large enough to overflow a 64bit ktime_t (which is something larger then the year 2262). Also unexpected behavior could occur if large negative offsets are injected via adjtimex. So this patch improves the sanity check timekeeping inputs by improving the timespec_valid() check, and then makes better use of timespec_valid() to make sure we don't set the time to an invalid negative value or one that overflows ktime_t. Note: This does not protect from setting the time close to overflowing ktime_t and then letting natural accumulation cause the overflow. Reported-by: CAI Qian Reported-by: Sasha Levin Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Zhouping Liu Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1344454580-17031-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 7 ------- include/linux/time.h | 22 ++++++++++++++++++++-- kernel/time/timekeeping.c | 26 ++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 603bec2913b..06177ba10a1 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -58,13 +58,6 @@ union ktime { typedef union ktime ktime_t; /* Kill this */ -#define KTIME_MAX ((s64)~((u64)1 << 63)) -#if (BITS_PER_LONG == 64) -# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#else -# define KTIME_SEC_MAX LONG_MAX -#endif - /* * ktime_t definitions when using the 64-bit scalar representation: */ diff --git a/include/linux/time.h b/include/linux/time.h index c81c5e40fcb..b0bbd8f0130 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -107,11 +107,29 @@ static inline struct timespec timespec_sub(struct timespec lhs, return ts_delta; } +#define KTIME_MAX ((s64)~((u64)1 << 63)) +#if (BITS_PER_LONG == 64) +# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#else +# define KTIME_SEC_MAX LONG_MAX +#endif + /* * Returns true if the timespec is norm, false if denorm: */ -#define timespec_valid(ts) \ - (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e16af197a2b..898bef066a4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -427,7 +427,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -463,6 +463,8 @@ int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -471,10 +473,17 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(tk); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid(&tmp)) { + ret = -EINVAL; + goto error; + } tk_xtime_add(tk, ts); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); +error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); write_sequnlock_irqrestore(&tk->lock, flags); @@ -482,7 +491,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -649,7 +658,20 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } seqlock_init(&tk->lock); -- cgit v1.2.3-70-g09d2 From 60916a9382e88fbf5e54fd36a3e658efd7ab7bed Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 16 Aug 2012 18:14:14 +0100 Subject: tracing/syscalls: Fix perf syscall tracing when syscall_nr == -1 syscall_get_nr can return -1 in the case that the task is not executing a system call. This patch fixes perf_syscall_{enter,exit} to check that the syscall number is valid before using it as an index into a bitmap. Link: http://lkml.kernel.org/r/1345137254-7377-1-git-send-email-will.deacon@arm.com Cc: Jason Baron Cc: Wade Farnsworth Cc: Frederic Weisbecker Signed-off-by: Will Deacon Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 60e4d787567..6b245f64c8d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; @@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); + if (syscall_nr < 0) + return; if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; -- cgit v1.2.3-70-g09d2 From be53db6e4edd9dc013b21a929ad2b142dea8b9c0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Aug 2012 14:40:59 +1200 Subject: alpha: take a bunch of syscalls into osf_sys.c New helper: current_thread_info(). Allows to do a bunch of odd syscalls in C. While we are at it, there had never been a reason to do osf_getpriority() in assembler. We also get "namespace"-aware (read: consistent with getuid(2), etc.) behaviour from getx?id() syscalls now. Signed-off-by: Al Viro Signed-off-by: Michael Cree Acked-by: Matt Turner Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/ptrace.h | 5 +- arch/alpha/kernel/entry.S | 109 ---------------------------------------- arch/alpha/kernel/osf_sys.c | 49 ++++++++++++++++++ arch/alpha/kernel/systbls.S | 2 +- kernel/timer.c | 9 ---- 5 files changed, 54 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/ptrace.h b/arch/alpha/include/asm/ptrace.h index fd698a174f2..b87755a1955 100644 --- a/arch/alpha/include/asm/ptrace.h +++ b/arch/alpha/include/asm/ptrace.h @@ -76,7 +76,10 @@ struct switch_stack { #define task_pt_regs(task) \ ((struct pt_regs *) (task_stack_page(task) + 2*PAGE_SIZE) - 1) -#define force_successful_syscall_return() (task_pt_regs(current)->r0 = 0) +#define current_pt_regs() \ + ((struct pt_regs *) ((char *)current_thread_info() + 2*PAGE_SIZE) - 1) + +#define force_successful_syscall_return() (current_pt_regs()->r0 = 0) #endif diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index 6d159cee5f2..22b0c4d414a 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -796,115 +796,6 @@ sys_rt_sigreturn: br ret_from_sys_call .end sys_rt_sigreturn - .align 4 - .globl sys_sethae - .ent sys_sethae -sys_sethae: - .prologue 0 - stq $16, 152($sp) - ret -.end sys_sethae - - .align 4 - .globl osf_getpriority - .ent osf_getpriority -osf_getpriority: - lda $sp, -16($sp) - stq $26, 0($sp) - .prologue 0 - - jsr $26, sys_getpriority - - ldq $26, 0($sp) - blt $0, 1f - - /* Return value is the unbiased priority, i.e. 20 - prio. - This does result in negative return values, so signal - no error by writing into the R0 slot. */ - lda $1, 20 - stq $31, 16($sp) - subl $1, $0, $0 - unop - -1: lda $sp, 16($sp) - ret -.end osf_getpriority - - .align 4 - .globl sys_getxuid - .ent sys_getxuid -sys_getxuid: - .prologue 0 - ldq $2, TI_TASK($8) - ldq $3, TASK_CRED($2) - ldl $0, CRED_UID($3) - ldl $1, CRED_EUID($3) - stq $1, 80($sp) - ret -.end sys_getxuid - - .align 4 - .globl sys_getxgid - .ent sys_getxgid -sys_getxgid: - .prologue 0 - ldq $2, TI_TASK($8) - ldq $3, TASK_CRED($2) - ldl $0, CRED_GID($3) - ldl $1, CRED_EGID($3) - stq $1, 80($sp) - ret -.end sys_getxgid - - .align 4 - .globl sys_getxpid - .ent sys_getxpid -sys_getxpid: - .prologue 0 - ldq $2, TI_TASK($8) - - /* See linux/kernel/timer.c sys_getppid for discussion - about this loop. */ - ldq $3, TASK_GROUP_LEADER($2) - ldq $4, TASK_REAL_PARENT($3) - ldl $0, TASK_TGID($2) -1: ldl $1, TASK_TGID($4) -#ifdef CONFIG_SMP - mov $4, $5 - mb - ldq $3, TASK_GROUP_LEADER($2) - ldq $4, TASK_REAL_PARENT($3) - cmpeq $4, $5, $5 - beq $5, 1b -#endif - stq $1, 80($sp) - ret -.end sys_getxpid - - .align 4 - .globl sys_alpha_pipe - .ent sys_alpha_pipe -sys_alpha_pipe: - lda $sp, -16($sp) - stq $26, 0($sp) - .prologue 0 - - mov $31, $17 - lda $16, 8($sp) - jsr $26, do_pipe_flags - - ldq $26, 0($sp) - bne $0, 1f - - /* The return values are in $0 and $20. */ - ldl $1, 12($sp) - ldl $0, 8($sp) - - stq $1, 80+16($sp) -1: lda $sp, 16($sp) - ret -.end sys_alpha_pipe - .align 4 .globl sys_execve .ent sys_execve diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 98a103621af..bc1acdda7a5 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1404,3 +1404,52 @@ SYSCALL_DEFINE3(osf_writev, unsigned long, fd, } #endif + +SYSCALL_DEFINE2(osf_getpriority, int, which, int, who) +{ + int prio = sys_getpriority(which, who); + if (prio >= 0) { + /* Return value is the unbiased priority, i.e. 20 - prio. + This does result in negative return values, so signal + no error */ + force_successful_syscall_return(); + prio = 20 - prio; + } + return prio; +} + +SYSCALL_DEFINE0(getxuid) +{ + current_pt_regs()->r20 = sys_geteuid(); + return sys_getuid(); +} + +SYSCALL_DEFINE0(getxgid) +{ + current_pt_regs()->r20 = sys_getegid(); + return sys_getgid(); +} + +SYSCALL_DEFINE0(getxpid) +{ + current_pt_regs()->r20 = sys_getppid(); + return sys_getpid(); +} + +SYSCALL_DEFINE0(alpha_pipe) +{ + int fd[2]; + int res = do_pipe_flags(fd, 0); + if (!res) { + /* The return values are in $0 and $20. */ + current_pt_regs()->r20 = fd[1]; + res = fd[0]; + } + return res; +} + +SYSCALL_DEFINE1(sethae, unsigned long, val) +{ + current_pt_regs()->hae = val; + return 0; +} diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S index b64157c98e0..2ac6b45c3e0 100644 --- a/arch/alpha/kernel/systbls.S +++ b/arch/alpha/kernel/systbls.S @@ -111,7 +111,7 @@ sys_call_table: .quad sys_socket .quad sys_connect .quad sys_accept - .quad osf_getpriority /* 100 */ + .quad sys_osf_getpriority /* 100 */ .quad sys_send .quad sys_recv .quad sys_sigreturn diff --git a/kernel/timer.c b/kernel/timer.c index a61c09374eb..8c5e7b908c6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1407,13 +1407,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - /** * sys_getpid - return the thread group id of the current process * @@ -1469,8 +1462,6 @@ SYSCALL_DEFINE0(getegid) return from_kgid_munged(current_user_ns(), current_egid()); } -#endif - static void process_timeout(unsigned long __data) { wake_up_process((struct task_struct *)__data); -- cgit v1.2.3-70-g09d2 From c7a3a88c938fbe3d70c2278e082b80eb830d1c58 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Aug 2012 19:10:42 +0200 Subject: uprobes: Fix mmap_region()'s mm->mm_rb corruption if uprobe_mmap() fails This patch fixes: https://bugzilla.redhat.com/show_bug.cgi?id=843640 If mmap_region()->uprobe_mmap() fails, unmap_and_free_vma path does unmap_region() but does not remove the soon-to-be-freed vma from rb tree. Actually there are more problems but this is how William noticed this bug. Perhaps we could do do_munmap() + return in this case, but in fact it is simply wrong to abort if uprobe_mmap() fails. Until at least we move the !UPROBE_COPY_INSN code from install_breakpoint() to uprobe_register(). For example, uprobe_mmap()->install_breakpoint() can fail if the probed insn is not supported (remember, uprobe_register() succeeds if nobody mmaps inode/offset), mmap() should not fail in this case. dup_mmap()->uprobe_mmap() is wrong too by the same reason, fork() can race with uprobe_register() and fail for no reason if it wins the race and does install_breakpoint() first. And, if nothing else, both mmap_region() and dup_mmap() return success if uprobe_mmap() fails. Change them to ignore the error code from uprobe_mmap(). Reported-and-tested-by: William Cohen Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: # v3.5 Cc: Anton Arapov Cc: William Cohen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120819171042.GB26957@redhat.com Signed-off-by: Ingo Molnar --- kernel/fork.c | 4 ++-- mm/mmap.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3bd2280d79f..2c8857e1285 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - if (file && uprobe_mmap(tmp)) - goto out; + if (file) + uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); diff --git a/mm/mmap.c b/mm/mmap.c index e3e86914f11..52e08fc9186 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1356,9 +1356,8 @@ out: } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); - if (file && uprobe_mmap(vma)) - /* matching probes but cannot insert */ - goto unmap_and_free_vma; + if (file) + uprobe_mmap(vma); return addr; -- cgit v1.2.3-70-g09d2 From f341861fb0b701139849f8a85c2d3cdff466e8e8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 15:05:14 +0200 Subject: task_work: add a scheduling point in task_work_run() It seems commit 4a9d4b024a31 ("switch fput to task_work_add") re- introduced the problem addressed in 944be0b22472 ("close_files(): add scheduling point") If a server process with a lot of files (say 2 million tcp sockets) is killed, we can spend a lot of time in task_work_run() and trigger a soft lockup. Signed-off-by: Eric Dumazet Signed-off-by: Linus Torvalds --- kernel/task_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 91d4e1742a0..d320d44903b 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -75,6 +75,7 @@ void task_work_run(void) p = q->next; q->func(q); q = p; + cond_resched(); } } } -- cgit v1.2.3-70-g09d2 From 784ffcbb96c3a97b4c64fd48b1dfe12ef3fcbcda Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 21 Aug 2012 20:30:46 -0400 Subject: time: Ensure we normalize the timekeeper in tk_xtime_add Andreas noticed problems with resume on specific hardware after commit 1e75fa8b (time: Condense timekeeper.xtime into xtime_sec) combined with commit b44d50dca (time: Fix casting issue in tk_set_xtime and tk_xtime_add) After some digging I realized we aren't normalizing the timekeeper after the add. Add the missing normalize call. Reported-by: Andreas Schwab Tested-by: Andreas Schwab Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1345595449-34965-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 898bef066a4..258164ae45c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -115,6 +115,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) { tk->xtime_sec += ts->tv_sec; tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; + tk_normalize_xtime(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) -- cgit v1.2.3-70-g09d2 From 85dc8f05c93c8105987de9d7e7cebf15a72ff4ec Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 21 Aug 2012 20:30:47 -0400 Subject: time: Fix casting issue in timekeeping_forward_now arch_gettimeoffset returns a u32 value which when shifted by tk->shift can overflow. This issue was introduced with 1e75fa8be (time: Condense timekeeper.xtime into xtime_sec) Cast it to u64 first. Signed-off-by: Andreas Schwab Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1345595449-34965-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 258164ae45c..1dbf80ec769 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -277,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->xtime_nsec += cycle_delta * tk->mult; /* If arch requires, add in gettimeoffset() */ - tk->xtime_nsec += arch_gettimeoffset() << tk->shift; + tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift; tk_normalize_xtime(tk); -- cgit v1.2.3-70-g09d2 From 6ea565a9be32a3c8d1092017686f183b6d8c4514 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 21 Aug 2012 20:30:48 -0400 Subject: time: Avoid potential shift overflow with large shift values Andreas Schwab noticed that the 1 << tk->shift could overflow if the shift value was greater than 30, since 1 would be a 32bit long on 32bit architectures. This issue was introduced by 1e75fa8be (time: Condense timekeeper.xtime into xtime_sec) Use 1ULL instead to ensure we don't overflow on the shift. Reported-by: Andreas Schwab Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1345595449-34965-4-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1dbf80ec769..a5a9389c4c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1184,9 +1184,9 @@ static void update_wall_time(void) * the vsyscall implementations are converted to use xtime_nsec * (shifted nanoseconds), this can be killed. */ - remainder = tk->xtime_nsec & ((1 << tk->shift) - 1); + remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); tk->xtime_nsec -= remainder; - tk->xtime_nsec += 1 << tk->shift; + tk->xtime_nsec += 1ULL << tk->shift; tk->ntp_error += remainder << tk->ntp_error_shift; /* -- cgit v1.2.3-70-g09d2 From bf2ac312195155511a0f79325515cbb61929898a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 21 Aug 2012 20:30:49 -0400 Subject: time: Avoid making adjustments if we haven't accumulated anything If update_wall_time() is called and the current offset isn't large enough to accumulate, avoid re-calling timekeeping_adjust which may change the clock freq and can cause 1ns inconsistencies with CLOCK_REALTIME_COARSE/CLOCK_MONOTONIC_COARSE. Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1345595449-34965-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a5a9389c4c3..0c1485e42be 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1152,6 +1152,10 @@ static void update_wall_time(void) offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif + /* Check if there's really nothing to do */ + if (offset < tk->cycle_interval) + goto out; + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, -- cgit v1.2.3-70-g09d2 From cee58483cf56e0ba355fdd97ff5e8925329aa936 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 31 Aug 2012 13:30:06 -0400 Subject: time: Move ktime_t overflow checking into timespec_valid_strict Andreas Bombe reported that the added ktime_t overflow checking added to timespec_valid in commit 4e8b14526ca7 ("time: Improve sanity checking of timekeeping inputs") was causing problems with X.org because it caused timeouts larger then KTIME_T to be invalid. Previously, these large timeouts would be clamped to KTIME_MAX and would never expire, which is valid. This patch splits the ktime_t overflow checking into a new timespec_valid_strict function, and converts the timekeeping codes internal checking to use this more strict function. Reported-and-tested-by: Andreas Bombe Cc: Zhouping Liu Cc: Ingo Molnar Cc: Prarit Bhargava Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- include/linux/time.h | 7 +++++++ kernel/time/timekeeping.c | 10 +++++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index b0bbd8f0130..b51e664c83e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -125,6 +125,13 @@ static inline bool timespec_valid(const struct timespec *ts) /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; + return true; +} + +static inline bool timespec_valid_strict(const struct timespec *ts) +{ + if (!timespec_valid(ts)) + return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0c1485e42be..34e5eac8142 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -428,7 +428,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if (!timespec_valid(tv)) + if (!timespec_valid_strict(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -476,7 +476,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* Make sure the proposed value is valid */ tmp = timespec_add(tk_xtime(tk), *ts); - if (!timespec_valid(&tmp)) { + if (!timespec_valid_strict(&tmp)) { ret = -EINVAL; goto error; } @@ -659,7 +659,7 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); - if (!timespec_valid(&now)) { + if (!timespec_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; @@ -667,7 +667,7 @@ void __init timekeeping_init(void) } read_boot_clock(&boot); - if (!timespec_valid(&boot)) { + if (!timespec_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -713,7 +713,7 @@ static struct timespec timekeeping_suspend_time; static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { - if (!timespec_valid(delta)) { + if (!timespec_valid_strict(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; -- cgit v1.2.3-70-g09d2 From f319da0c6894fcf55e21320e40506418a2aad629 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Aug 2012 11:26:57 +0200 Subject: sched: Fix load avg vs cpu-hotplug Rabik and Paul reported two different issues related to the same few lines of code. Rabik's issue is that the nr_uninterruptible migration code is wrong in that he sees artifacts due to this (Rabik please do expand in more detail). Paul's issue is that this code as it stands relies on us using stop_machine() for unplug, we all would like to remove this assumption so that eventually we can remove this stop_machine() usage altogether. The only reason we'd have to migrate nr_uninterruptible is so that we could use for_each_online_cpu() loops in favour of for_each_possible_cpu() loops, however since nr_uninterruptible() is the only such loop and its using possible lets not bother at all. The problem Rabik sees is (probably) caused by the fact that by migrating nr_uninterruptible we screw rq->calc_load_active for both rqs involved. So don't bother with fancy migration schemes (meaning we now have to keep using for_each_possible_cpu()) and instead fold any nr_active delta after we migrate all tasks away to make sure we don't have any skewed nr_active accounting. Reported-by: Rakib Mullick Reported-by: Paul E. McKenney Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345454817.23018.27.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbf1fd098dc..207a81c769d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5304,27 +5304,17 @@ void idle_task_exit(void) } /* - * While a dead CPU has no uninterruptible tasks queued at this point, - * it might still have a nonzero ->nr_uninterruptible counter, because - * for performance reasons the counter is not stricly tracking tasks to - * their home CPUs. So we just add the counter to another CPU's counter, - * to keep the global sum constant after CPU-down: - */ -static void migrate_nr_uninterruptible(struct rq *rq_src) -{ - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - - rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; - rq_src->nr_uninterruptible = 0; -} - -/* - * remove the tasks which were accounted by rq from calc_load_tasks. + * Since this CPU is going 'away' for a while, fold any nr_active delta + * we might have. Assumes we're called after migrate_tasks() so that the + * nr_active count is stable. + * + * Also see the comment "Global load-average calculations". */ -static void calc_global_load_remove(struct rq *rq) +static void calc_load_migrate(struct rq *rq) { - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + long delta = calc_load_fold_active(rq); + if (delta) + atomic_long_add(delta, &calc_load_tasks); } /* @@ -5618,8 +5608,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); - migrate_nr_uninterruptible(rq); - calc_global_load_remove(rq); + calc_load_migrate(rq); break; #endif } -- cgit v1.2.3-70-g09d2 From 749c8814f08f12baa4a9c2812a7c6ede7d69507d Mon Sep 17 00:00:00 2001 From: Charles Wang Date: Mon, 20 Aug 2012 16:02:33 +0800 Subject: sched: Add missing call to calc_load_exit_idle() Azat Khuzhin reported high loadavg in Linux v3.6 After checking the upstream scheduler code, I found Peter's commit: 5167e8d5417b sched/nohz: Rewrite and fix load-avg computation -- again not fully applied, missing the call to calc_load_exit_idle(). After that idle exit in sampling window will always be calculated to non-idle, and the load will be higher than normal. This patch adds the missing call to calc_load_exit_idle(). Signed-off-by: Charles Wang Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1345449754-27130-1-git-send-email-muming.wq@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 024540f97f7..3a9e5d5c109 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -573,6 +573,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) tick_do_update_jiffies64(now); update_cpu_load_nohz(); + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3-70-g09d2 From a4c96ae319b8047f62dedbe1eac79e321c185749 Mon Sep 17 00:00:00 2001 From: Peter Boonstoppel Date: Thu, 9 Aug 2012 15:34:47 -0700 Subject: sched: Unthrottle rt runqueues in __disable_runtime() migrate_tasks() uses _pick_next_task_rt() to get tasks from the real-time runqueues to be migrated. When rt_rq is throttled _pick_next_task_rt() won't return anything, in which case migrate_tasks() can't move all threads over and gets stuck in an infinite loop. Instead unthrottle rt runqueues before migrating tasks. Additionally: move unthrottle_offline_cfs_rqs() to rq_offline_fair() Signed-off-by: Peter Boonstoppel Signed-off-by: Peter Zijlstra Cc: Paul Turner Link: http://lkml.kernel.org/r/5FBF8E85CA34454794F0F7ECBA79798F379D3648B7@HQMAIL04.nvidia.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 --- kernel/sched/fair.c | 7 +++++-- kernel/sched/rt.c | 1 + kernel/sched/sched.h | 1 - 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 207a81c769d..a4ea245f3d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5342,9 +5342,6 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; - /* Ensure any throttled groups are reachable by pick_next_task */ - unthrottle_offline_cfs_rqs(rq); - for ( ; ; ) { /* * There's this thread running, bail when that's the only diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c219bf8d704..86ad83c45da 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_cancel(&cfs_b->slack_timer); } -void unthrottle_offline_cfs_rqs(struct rq *rq) +static void unthrottle_offline_cfs_rqs(struct rq *rq) { struct cfs_rq *cfs_rq; @@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -void unthrottle_offline_cfs_rqs(struct rq *rq) {} +static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #endif /* CONFIG_CFS_BANDWIDTH */ @@ -4956,6 +4956,9 @@ static void rq_online_fair(struct rq *rq) static void rq_offline_fair(struct rq *rq) { update_sysctl(); + + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 944cb68420e..e0b7ba9c040 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -691,6 +691,7 @@ balanced: * runtime - in which case borrowing doesn't make sense. */ rt_rq->rt_runtime = RUNTIME_INF; + rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f6714d009e7..0848fa36c38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1144,7 +1144,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void unthrottle_offline_cfs_rqs(struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -- cgit v1.2.3-70-g09d2 From 9450d57eab5cad36774c297da123062744472588 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Aug 2012 17:45:08 -0700 Subject: sched: Fix kernel-doc warnings in kernel/sched/fair.c Fix two kernel-doc warnings in kernel/sched/fair.c: Warning(kernel/sched/fair.c:3660): Excess function parameter 'cpus' description in 'update_sg_lb_stats' Warning(kernel/sched/fair.c:3806): Excess function parameter 'cpus' description in 'update_sd_lb_stats' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/50303714.3090204@xenotime.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 86ad83c45da..42d9df6a5ca 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3658,7 +3658,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ @@ -3805,7 +3804,6 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. - * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ -- cgit v1.2.3-70-g09d2 From a6fa941d94b411bbd2b6421ffbde6db3c93e65ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Aug 2012 14:59:25 +0100 Subject: perf_event: Switch to internal refcount, fix race with close() Don't mess with file refcounts (or keep a reference to file, for that matter) in perf_event. Use explicit refcount of its own instead. Deal with the race between the final reference to event going away and new children getting created for it by use of atomic_long_inc_not_zero() in inherit_event(); just have the latter free what it had allocated and return NULL, that works out just fine (children of siblings of something doomed are created as singletons, same as if the child of leader had been created and immediately killed). Signed-off-by: Al Viro Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120820135925.GG23464@ZenIV.linux.org.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 62 ++++++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7602ccb3f40..ad04dfcd6f3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -926,7 +926,7 @@ struct perf_event { struct hw_perf_event hw; struct perf_event_context *ctx; - struct file *filp; + atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d..efef4282b8e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event) { - struct perf_event *event = file->private_data; struct task_struct *owner; - file->private_data = NULL; + if (!atomic_long_dec_and_test(&event->refcount)) + return; rcu_read_lock(); owner = ACCESS_ONCE(event->owner); @@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(owner); } - return perf_event_release_kernel(event); + perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ + put_event(file->private_data); + return 0; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3227,7 +3233,7 @@ unlock: static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed) { struct file *file; @@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) return ERR_PTR(-EBADF); } - return file->private_data; + return file; } static int perf_event_set_output(struct perf_event *event, @@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { + struct file *output_file = NULL; struct perf_event *output_event = NULL; int fput_needed = 0; int ret; if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); + output_file = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_file)) + return PTR_ERR(output_file); + output_event = output_file->private_data; } ret = perf_event_set_output(event, output_event); if (output_event) - fput_light(output_event->filp, fput_needed); + fput_light(output_file, fput_needed); return ret; } @@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->mmap_mutex); + atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); + group_file = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_file)) { + err = PTR_ERR(group_file); goto err_fd; } - group_file = group_leader->filp; + group_leader = group_file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_free; } - event->filp = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); @@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event, * Release the parent event, if this was the last * reference to it. */ - fput(parent_event->filp); + put_event(parent_event); } static void @@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * __perf_event_exit_task() * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - fput(parent->filp); + put_event(parent); perf_group_detach(event); list_del_event(event, ctx); @@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event, NULL, NULL); if (IS_ERR(child_event)) return child_event; + + if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + free_event(child_event); + return NULL; + } + get_ctx(child_ctx); /* @@ -6844,14 +6856,6 @@ inherit_event(struct perf_event *parent_event, add_event_to_ctx(child_event, child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - /* * Link this into the parent event's child list */ -- cgit v1.2.3-70-g09d2 From 500ad2d8b01390c98bc6dce068bccfa9534b8212 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Thu, 2 Aug 2012 13:46:35 +0530 Subject: perf/hwpb: Invoke __perf_event_disable() if interrupts are already disabled While debugging a warning message on PowerPC while using hardware breakpoints, it was discovered that when perf_event_disable is invoked through hw_breakpoint_handler function with interrupts disabled, a subsequent IPI in the code path would trigger a WARN_ON_ONCE message in smp_call_function_single function. This patch calls __perf_event_disable() when interrupts are already disabled, instead of perf_event_disable(). Reported-by: Edjunior Barbosa Machado Signed-off-by: K.Prasad [naveen.n.rao@linux.vnet.ibm.com: v3: Check to make sure we target current task] Signed-off-by: Naveen N. Rao Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120802081635.5811.17737.stgit@localhost.localdomain [ Fixed build error on MIPS. ] Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 2 +- kernel/events/hw_breakpoint.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ad04dfcd6f3..33ed9d605f9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1296,6 +1296,7 @@ extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); +extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); #else static inline void @@ -1334,6 +1335,7 @@ static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } +static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index efef4282b8e..7fee567153f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1253,7 +1253,7 @@ retry: /* * Cross CPU call to disable a performance event */ -static int __perf_event_disable(void *info) +int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bb38c4d3ee1..9a7b487c6fe 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att int old_type = bp->attr.bp_type; int err = 0; - perf_event_disable(bp); + /* + * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it + * will not be possible to raise IPIs that invoke __perf_event_disable. + * So call the function directly after making sure we are targeting the + * current task. + */ + if (irqs_disabled() && bp->ctx && bp->ctx->task == current) + __perf_event_disable(bp); + else + perf_event_disable(bp); bp->attr.bp_addr = attr->bp_addr; bp->attr.bp_type = attr->bp_type; -- cgit v1.2.3-70-g09d2 From 96e65306b81351b656835c15931d1d237b252f27 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 2 Sep 2012 00:28:19 +0800 Subject: workqueue: UNBOUND -> REBIND morphing in rebind_workers() should be atomic The compiler may compile the following code into TWO write/modify instructions. worker->flags &= ~WORKER_UNBOUND; worker->flags |= WORKER_REBIND; so the other CPU may temporarily see worker->flags which doesn't have either WORKER_UNBOUND or WORKER_REBIND set and perform local wakeup prematurely. Fix it by using single explicit assignment via ACCESS_ONCE(). Because idle workers have another WORKER_NOT_RUNNING flag, this bug doesn't exist for them; however, update it to use the same pattern for consistency. tj: Applied the change to idle workers too and updated comments and patch description a bit. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/workqueue.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 692d97628a1..c462cd60c37 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1396,12 +1396,15 @@ retry: /* set REBIND and kick idle ones, we'll wait for these later */ for_each_worker_pool(pool, gcwq) { list_for_each_entry(worker, &pool->idle_list, entry) { + unsigned long worker_flags = worker->flags; + if (worker->flags & WORKER_REBIND) continue; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; idle_rebind.cnt++; worker->idle_rebind = &idle_rebind; @@ -1434,10 +1437,12 @@ retry: /* rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; + unsigned long worker_flags = worker->flags; - /* morph UNBOUND to REBIND */ - worker->flags &= ~WORKER_UNBOUND; - worker->flags |= WORKER_REBIND; + /* morph UNBOUND to REBIND atomically */ + worker_flags &= ~WORKER_UNBOUND; + worker_flags |= WORKER_REBIND; + ACCESS_ONCE(worker->flags) = worker_flags; if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(rebind_work))) -- cgit v1.2.3-70-g09d2 From 90beca5de591e12482a812f23a7f10690962ed4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:12:33 -0700 Subject: workqueue: move WORKER_REBIND clearing in rebind_workers() to the end of the function This doesn't make any functional difference and is purely to help the next patch to be simpler. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c462cd60c37..d79a18d0c42 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1422,19 +1422,7 @@ retry: goto retry; } - /* - * All idle workers are rebound and waiting for %WORKER_REBIND to - * be cleared inside idle_worker_rebind(). Clear and release. - * Clearing %WORKER_REBIND from this foreign context is safe - * because these workers are still guaranteed to be idle. - */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags &= ~WORKER_REBIND; - - wake_up_all(&gcwq->rebind_hold); - - /* rebind busy workers */ + /* all idle workers are rebound, rebind busy workers */ for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; unsigned long worker_flags = worker->flags; @@ -1454,6 +1442,18 @@ retry: worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } + + /* + * All idle workers are rebound and waiting for %WORKER_REBIND to + * be cleared inside idle_worker_rebind(). Clear and release. + * Clearing %WORKER_REBIND from this foreign context is safe + * because these workers are still guaranteed to be idle. + */ + for_each_worker_pool(pool, gcwq) + list_for_each_entry(worker, &pool->idle_list, entry) + worker->flags &= ~WORKER_REBIND; + + wake_up_all(&gcwq->rebind_hold); } static struct worker *alloc_worker(void) -- cgit v1.2.3-70-g09d2 From ec58815ab0409a921a7c9744eb4ca44866b14d71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Sep 2012 23:16:32 -0700 Subject: workqueue: fix possible deadlock in idle worker rebinding Currently, rebind_workers() and idle_worker_rebind() are two-way interlocked. rebind_workers() waits for idle workers to finish rebinding and rebound idle workers wait for rebind_workers() to finish rebinding busy workers before proceeding. Unfortunately, this isn't enough. The second wait from idle workers is implemented as follows. wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); rebind_workers() clears WORKER_REBIND, wakes up the idle workers and then returns. If CPU hotplug cycle happens again before one of the idle workers finishes the above wait_event(), rebind_workers() will repeat the first part of the handshake - set WORKER_REBIND again and wait for the idle worker to finish rebinding - and this leads to deadlock because the idle worker would be waiting for WORKER_REBIND to clear. This is fixed by adding another interlocking step at the end - rebind_workers() now waits for all the idle workers to finish the above WORKER_REBIND wait before returning. This ensures that all rebinding steps are complete on all idle workers before the next hotplug cycle can happen. This problem was diagnosed by Lai Jiangshan who also posted a patch to fix the issue, upon which this patch is based. This is the minimal fix and further patches are scheduled for the next merge window to simplify the CPU hotplug path. Signed-off-by: Tejun Heo Original-patch-by: Lai Jiangshan LKML-Reference: <1346516916-1991-3-git-send-email-laijs@cn.fujitsu.com> --- kernel/workqueue.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d79a18d0c42..dc7b8458e27 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1326,6 +1326,15 @@ static void idle_worker_rebind(struct worker *worker) /* we did our part, wait for rebind_workers() to finish up */ wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); + + /* + * rebind_workers() shouldn't finish until all workers passed the + * above WORKER_REBIND wait. Tell it when done. + */ + spin_lock_irq(&worker->pool->gcwq->lock); + if (!--worker->idle_rebind->cnt) + complete(&worker->idle_rebind->done); + spin_unlock_irq(&worker->pool->gcwq->lock); } /* @@ -1448,12 +1457,28 @@ retry: * be cleared inside idle_worker_rebind(). Clear and release. * Clearing %WORKER_REBIND from this foreign context is safe * because these workers are still guaranteed to be idle. + * + * We need to make sure all idle workers passed WORKER_REBIND wait + * in idle_worker_rebind() before returning; otherwise, workers can + * get stuck at the wait if hotplug cycle repeats. */ - for_each_worker_pool(pool, gcwq) - list_for_each_entry(worker, &pool->idle_list, entry) + idle_rebind.cnt = 1; + INIT_COMPLETION(idle_rebind.done); + + for_each_worker_pool(pool, gcwq) { + list_for_each_entry(worker, &pool->idle_list, entry) { worker->flags &= ~WORKER_REBIND; + idle_rebind.cnt++; + } + } wake_up_all(&gcwq->rebind_hold); + + if (--idle_rebind.cnt) { + spin_unlock_irq(&gcwq->lock); + wait_for_completion(&idle_rebind.done); + spin_lock_irq(&gcwq->lock); + } } static struct worker *alloc_worker(void) -- cgit v1.2.3-70-g09d2 From 552a37e9360a293cd20e7f8ff1fb326a244c5f1e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:33 -0700 Subject: workqueue: restore POOL_MANAGING_WORKERS This patch restores POOL_MANAGING_WORKERS which was replaced by pool->manager_mutex by 6037315269 "workqueue: use mutex for global_cwq manager exclusion". There's a subtle idle worker depletion bug across CPU hotplug events and we need to distinguish an actual manager and CPU hotplug preventing management. POOL_MANAGING_WORKERS will be used for the former and manager_mutex the later. This patch just lays POOL_MANAGING_WORKERS on top of the existing manager_mutex and doesn't introduce any synchronization changes. The next patch will update it. Note that this patch fixes a non-critical anomaly where too_many_workers() may return %true spuriously while CPU hotplug is in progress. While the issue could schedule idle timer spuriously, it didn't trigger any actual misbehavior. tj: Rewrote patch description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dc7b8458e27..383548ed0b5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -66,6 +66,7 @@ enum { /* pool flags */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ /* worker flags */ WORKER_STARTED = 1 << 0, /* started */ @@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_mutex); + bool managing = pool->flags & POOL_MANAGING_WORKERS; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1827,6 +1828,7 @@ static bool manage_workers(struct worker *worker) if (!mutex_trylock(&pool->manager_mutex)) return ret; + pool->flags |= POOL_MANAGING_WORKERS; pool->flags &= ~POOL_MANAGE_WORKERS; /* @@ -1836,6 +1838,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); + pool->flags &= ~POOL_MANAGING_WORKERS; mutex_unlock(&pool->manager_mutex); return ret; } -- cgit v1.2.3-70-g09d2 From ee378aa49b594da9bda6a2c768cc5b2ad585f911 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 10 Sep 2012 10:03:44 -0700 Subject: workqueue: fix possible idle worker depletion across CPU hotplug To simplify both normal and CPU hotplug paths, worker management is prevented while CPU hoplug is in progress. This is achieved by CPU hotplug holding the same exclusion mechanism used by workers to ensure there's only one manager per pool. If someone else seems to be performing the manager role, workers proceed to execute work items. CPU hotplug using the same mechanism can lead to idle worker depletion because all workers could proceed to execute work items while CPU hotplug is in progress and CPU hotplug itself wouldn't actually perform the worker management duty - it doesn't guarantee that there's an idle worker left when it releases management. This idle worker depletion, under extreme circumstances, can break forward-progress guarantee and thus lead to deadlock. This patch fixes the bug by using separate mechanisms for manager exclusion among workers and hotplug exclusion. For manager exclusion, POOL_MANAGING_WORKERS which was restored by the previous patch is used. pool->manager_mutex is now only used for exclusion between the elected manager and CPU hotplug. The elected manager won't proceed without holding pool->manager_mutex. This ensures that the worker which won the manager position can't skip managing while CPU hotplug is in progress. It will block on manager_mutex and perform management after CPU hotplug is complete. Note that hotplug may happen while waiting for manager_mutex. A manager isn't either on idle or busy list and thus the hoplug code can't unbind/rebind it. Make the manager handle its own un/rebinding. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 383548ed0b5..1e1373bcb3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1825,10 +1825,45 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (!mutex_trylock(&pool->manager_mutex)) + if (pool->flags & POOL_MANAGING_WORKERS) return ret; pool->flags |= POOL_MANAGING_WORKERS; + + /* + * To simplify both worker management and CPU hotplug, hold off + * management while hotplug is in progress. CPU hotplug path can't + * grab %POOL_MANAGING_WORKERS to achieve this because that can + * lead to idle worker depletion (all become busy thinking someone + * else is managing) which in turn can result in deadlock under + * extreme circumstances. Use @pool->manager_mutex to synchronize + * manager against CPU hotplug. + * + * manager_mutex would always be free unless CPU hotplug is in + * progress. trylock first without dropping @gcwq->lock. + */ + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { + spin_unlock_irq(&pool->gcwq->lock); + mutex_lock(&pool->manager_mutex); + /* + * CPU hotplug could have happened while we were waiting + * for manager_mutex. Hotplug itself can't handle us + * because manager isn't either on idle or busy list, and + * @gcwq's state and ours could have deviated. + * + * As hotplug is now excluded via manager_mutex, we can + * simply try to bind. It will succeed or fail depending + * on @gcwq's current state. Try it and adjust + * %WORKER_UNBOUND accordingly. + */ + if (worker_maybe_bind_and_lock(worker)) + worker->flags &= ~WORKER_UNBOUND; + else + worker->flags |= WORKER_UNBOUND; + + ret = true; + } + pool->flags &= ~POOL_MANAGE_WORKERS; /* -- cgit v1.2.3-70-g09d2 From ec145babe754f9ea1079034a108104b6001e001c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 11 Sep 2012 19:26:03 -0400 Subject: time: Fix timeekeping_get_ns overflow on 32bit systems Daniel Lezcano reported seeing multi-second stalls from keyboard input on his T61 laptop when NOHZ and CPU_IDLE were enabled on a 32bit kernel. He bisected the problem down to commit 1e75fa8be9fb6 ("time: Condense timekeeper.xtime into xtime_sec"). After reproducing this issue, I narrowed the problem down to the fact that timekeeping_get_ns() returns a 64bit nsec value that hasn't been accumulated. In some cases this value was being then stored in timespec.tv_nsec (which is a long). On 32bit systems, with idle times larger then 4 seconds (or less, depending on the value of xtime_nsec), the returned nsec value would overflow 32bits. This limited kept time from increasing, causing timers to not expire. The fix is to make sure we don't directly store the result of timekeeping_get_ns() into a tv_nsec field, instead using a 64bit nsec value which can then be added into the timespec via timespec_add_ns(). Reported-and-bisected-by: Daniel Lezcano Tested-by: Daniel Lezcano Signed-off-by: John Stultz Acked-by: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1347405963-35715-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34e5eac8142..d3b91e75cec 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -303,10 +303,11 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsecs = timekeeping_get_ns(tk); } while (read_seqretry(&tk->lock, seq)); + ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); } EXPORT_SYMBOL(getnstimeofday); @@ -345,6 +346,7 @@ void ktime_get_ts(struct timespec *ts) { struct timekeeper *tk = &timekeeper; struct timespec tomono; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); @@ -352,13 +354,14 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); + ts->tv_sec += tomono.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec); } EXPORT_SYMBOL_GPL(ktime_get_ts); @@ -1244,6 +1247,7 @@ void get_monotonic_boottime(struct timespec *ts) { struct timekeeper *tk = &timekeeper; struct timespec tomono, sleep; + s64 nsec; unsigned int seq; WARN_ON(timekeeping_suspended); @@ -1251,14 +1255,15 @@ void get_monotonic_boottime(struct timespec *ts) do { seq = read_seqbegin(&tk->lock); ts->tv_sec = tk->xtime_sec; - ts->tv_nsec = timekeeping_get_ns(tk); + nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; } while (read_seqretry(&tk->lock, seq)); - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, - ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec); + ts->tv_sec += tomono.tv_sec + sleep.tv_sec; + ts->tv_nsec = 0; + timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); } EXPORT_SYMBOL_GPL(get_monotonic_boottime); -- cgit v1.2.3-70-g09d2 From 37407ea7f93864c2cfc03edf8f37872ec539ea2b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Sep 2012 12:29:43 -0700 Subject: Revert "sched: Improve scalability via 'CPU buddies', which withstand random perturbations" This reverts commit 970e178985cadbca660feb02f4d2ee3a09f7fdda. Nikolay Ulyanitsky reported thatthe 3.6-rc5 kernel has a 15-20% performance drop on PostgreSQL 9.2 on his machine (running "pgbench"). Borislav Petkov was able to reproduce this, and bisected it to this commit 970e178985ca ("sched: Improve scalability via 'CPU buddies' ...") apparently because the new single-idle-buddy model simply doesn't find idle CPU's to reschedule on aggressively enough. Mike Galbraith suspects that it is likely due to the user-mode spinlocks in PostgreSQL not reacting well to preemption, but we don't really know the details - I'll just revert the commit for now. There are hopefully other approaches to improve scheduler scalability without it causing these kinds of downsides. Reported-by: Nikolay Ulyanitsky Bisected-by: Borislav Petkov Acked-by: Mike Galbraith Cc: Andrew Morton Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/sched/core.c | 39 +-------------------------------------- kernel/sched/fair.c | 28 +++++++++++++++++++++------- 3 files changed, 22 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b8c86648a2f..23bddac4bad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -954,7 +954,6 @@ struct sched_domain { unsigned int smt_gain; int flags; /* See SD_* */ int level; - int idle_buddy; /* cpu assigned to select_idle_sibling() */ /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. units in jiffies */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a4ea245f3d8..649c9f876cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6014,11 +6014,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * - * Iterate domains and sched_groups downward, assigning CPUs to be - * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing - * due to random perturbation self canceling, ie sw buddies pull - * their counterpart to their CPU's hw counterpart. - * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6032,40 +6027,8 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) { - struct sched_domain *tmp = sd; - struct sched_group *sg, *prev; - bool right; - - /* - * Traverse to first CPU in group, and count hops - * to cpu from there, switching direction on each - * hop, never ever pointing the last CPU rightward. - */ - do { - id = cpumask_first(sched_domain_span(tmp)); - prev = sg = tmp->groups; - right = 1; - - while (cpumask_first(sched_group_cpus(sg)) != id) - sg = sg->next; - - while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { - prev = sg; - sg = sg->next; - right = !right; - } - - /* A CPU went down, never point back to domain start. */ - if (right && cpumask_first(sched_group_cpus(sg->next)) == id) - right = false; - - sg = right ? sg->next : prev; - tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); - } while ((tmp = tmp->child)); - + if (sd) id = cpumask_first(sched_domain_span(sd)); - } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42d9df6a5ca..96e2b18b628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; + struct sched_group *sg; + int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, check assigned siblings to find an elegible idle cpu. + * Otherwise, iterate the domains and find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) - continue; - if (idle_cpu(sd->idle_buddy)) - return sd->idle_buddy; - } + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; + for_each_cpu(i, sched_group_cpus(sg)) { + if (!idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; +next: + sg = sg->next; + } while (sg != sd->groups); + } +done: return target; } -- cgit v1.2.3-70-g09d2 From 579035dc5ddd6d48fd8529e7358b03d911ab9d8a Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Mon, 17 Sep 2012 14:09:12 -0700 Subject: pid-namespace: limit value of ns_last_pid to (0, max_pid) The kernel doesn't check the pid for negative values, so if you try to write -2 to /proc/sys/kernel/ns_last_pid, you will get a kernel panic. The crash happens because the next pid is -1, and alloc_pidmap() will try to access to a nonexistent pidmap. map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; Signed-off-by: Andrew Vagin Acked-by: Cyrill Gorcunov Acked-by: Oleg Nesterov Cc: Eric W. Biederman Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b3c7fd55425..6144bab8fd8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, */ tmp.data = ¤t->nsproxy->pid_ns->last_pid; - return proc_dointvec(&tmp, write, buffer, lenp, ppos); + return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); } +extern int pid_max; +static int zero = 0; static struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, + .extra1 = &zero, + .extra2 = &pid_max, }, { } }; -- cgit v1.2.3-70-g09d2 From 960bd11bf2daf669d0d910428fd9ef5a15c3d7cb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 17 Sep 2012 15:42:31 -0700 Subject: workqueue: always clear WORKER_REBIND in busy_worker_rebind_fn() busy_worker_rebind_fn() didn't clear WORKER_REBIND if rebinding failed (CPU is down again). This used to be okay because the flag wasn't used for anything else. However, after 25511a477 "workqueue: reimplement CPU online rebinding to handle idle workers", WORKER_REBIND is also used to command idle workers to rebind. If not cleared, the worker may confuse the next CPU_UP cycle by having REBIND spuriously set or oops / get stuck by prematurely calling idle_worker_rebind(). WARNING: at /work/os/wq/kernel/workqueue.c:1323 worker_thread+0x4cd/0x5 00() Hardware name: Bochs Modules linked in: test_wq(O-) Pid: 33, comm: kworker/1:1 Tainted: G O 3.6.0-rc1-work+ #3 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] worker_thread+0x4cd/0x500 [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 ---[ end trace e977cf20f4661968 ]--- BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] worker_thread+0x360/0x500 PGD 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: test_wq(O-) CPU 0 Pid: 33, comm: kworker/1:1 Tainted: G W O 3.6.0-rc1-work+ #3 Bochs Bochs RIP: 0010:[] [] worker_thread+0x360/0x500 RSP: 0018:ffff88001e1c9de0 EFLAGS: 00010086 RAX: 0000000000000000 RBX: ffff88001e633e00 RCX: 0000000000004140 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000009 RBP: ffff88001e1c9ea0 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000002 R11: 0000000000000000 R12: ffff88001fc8d580 R13: ffff88001fc8d590 R14: ffff88001e633e20 R15: ffff88001e1c6900 FS: 0000000000000000(0000) GS:ffff88001fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 00000000130e8000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process kworker/1:1 (pid: 33, threadinfo ffff88001e1c8000, task ffff88001e1c6900) Stack: ffff880000000000 ffff88001e1c9e40 0000000000000001 ffff88001e1c8010 ffff88001e519c78 ffff88001e1c9e58 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001e1c6900 ffff88001fc8d340 ffff88001fc8d340 Call Trace: [] kthread+0xbe/0xd0 [] kernel_thread_helper+0x4/0x10 Code: b1 00 f6 43 48 02 0f 85 91 01 00 00 48 8b 43 38 48 89 df 48 8b 00 48 89 45 90 e8 ac f0 ff ff 3c 01 0f 85 60 01 00 00 48 8b 53 50 <8b> 02 83 e8 01 85 c0 89 02 0f 84 3b 01 00 00 48 8b 43 38 48 8b RIP [] worker_thread+0x360/0x500 RSP CR2: 0000000000000000 There was no reason to keep WORKER_REBIND on failure in the first place - WORKER_UNBOUND is guaranteed to be set in such cases preventing incorrectly activating concurrency management. Always clear WORKER_REBIND. tj: Updated comment and description. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1e1373bcb3e..b80065a2450 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1349,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work) struct worker *worker = container_of(work, struct worker, rebind_work); struct global_cwq *gcwq = worker->pool->gcwq; - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_REBIND); + worker_maybe_bind_and_lock(worker); + + /* + * %WORKER_REBIND must be cleared even if the above binding failed; + * otherwise, we may confuse the next CPU_UP cycle or oops / get + * stuck by calling idle_worker_rebind() prematurely. If CPU went + * down again inbetween, %WORKER_UNBOUND would be set, so clearing + * %WORKER_REBIND is always safe. + */ + worker_clr_flags(worker, WORKER_REBIND); spin_unlock_irq(&gcwq->lock); } -- cgit v1.2.3-70-g09d2 From ed48ece27cd3d5ee0354c32bbaec0f3e1d4715c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Sep 2012 12:48:43 -0700 Subject: workqueue: reimplement work_on_cpu() using system_wq The existing work_on_cpu() implementation is hugely inefficient. It creates a new kthread, execute that single function and then let the kthread die on each invocation. Now that system_wq can handle concurrent executions, there's no advantage of doing this. Reimplement work_on_cpu() using system_wq which makes it simpler and way more efficient. stable: While this isn't a fix in itself, it's needed to fix a workqueue related bug in cpufreq/powernow-k8. AFAICS, this shouldn't break other existing users. Signed-off-by: Tejun Heo Acked-by: Jiri Kosina Cc: Linus Torvalds Cc: Bjorn Helgaas Cc: Len Brown Cc: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/workqueue.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b80065a2450..3c5a79e2134 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3576,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, #ifdef CONFIG_SMP struct work_for_cpu { - struct completion completion; + struct work_struct work; long (*fn)(void *); void *arg; long ret; }; -static int do_work_for_cpu(void *_wfc) +static void work_for_cpu_fn(struct work_struct *work) { - struct work_for_cpu *wfc = _wfc; + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + wfc->ret = wfc->fn(wfc->arg); - complete(&wfc->completion); - return 0; } /** @@ -3602,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc) */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { - struct task_struct *sub_thread; - struct work_for_cpu wfc = { - .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), - .fn = fn, - .arg = arg, - }; + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); - if (IS_ERR(sub_thread)) - return PTR_ERR(sub_thread); - kthread_bind(sub_thread, cpu); - wake_up_process(sub_thread); - wait_for_completion(&wfc.completion); + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.2.3-70-g09d2