From c685689fd24d310343ac33942e9a54a974ae9c43 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 24 Feb 2014 11:29:50 +0800 Subject: genirq: Remove racy waitqueue_active check We hit one rare case below: T1 calling disable_irq(), but hanging at synchronize_irq() always; The corresponding irq thread is in sleeping state; And all CPUs are in idle state; After analysis, we found there is one possible scenerio which causes T1 is waiting there forever: CPU0 CPU1 synchronize_irq() wait_event() spin_lock() atomic_dec_and_test(&threads_active) insert the __wait into queue spin_unlock() if(waitqueue_active) atomic_read(&threads_active) wake_up() Here after inserted the __wait into queue on CPU0, and before test if queue is empty on CPU1, there is no barrier, it maybe cause it is not visible for CPU1 immediately, although CPU0 has updated the queue list. It is similar for CPU0 atomic_read() threads_active also. So we'd need one smp_mb() before waitqueue_active.that, but removing the waitqueue_active() check solves it as wel l and it makes things simple and clear. Signed-off-by: Chuansheng Liu Cc: Xiaoming Wang Link: http://lkml.kernel.org/r/1393212590-32543-1-git-send-email-chuansheng.liu@intel.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b1..d3bf660cb57 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } -- cgit v1.2.3-70-g09d2 From 791c9e0292671a3bfa95286bb5c08129d8605618 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 18 Feb 2014 17:56:51 -0600 Subject: sched: Fix double normalization of vruntime dequeue_entity() is called when p->on_rq and sets se->on_rq = 0 which appears to guarentee that the !se->on_rq condition is met. If the task has done set_current_state(TASK_INTERRUPTIBLE) without schedule() the second condition will be met and vruntime will be incorrectly adjusted twice. In certain cases this can result in the task's vruntime never increasing past the vruntime of other tasks on the CFS' run queue, starving them of CPU time. This patch changes switched_from_fair() to use !p->on_rq instead of !se->on_rq. I'm able to cause a task with a priority of 120 to starve all other tasks with the same priority on an ARM platform running 3.2.51-rt72 PREEMPT RT by writing one character at time to a serial tty (16550 UART) in a tight loop. I'm also able to verify making this change corrects the problem on that platform and kernel version. Signed-off-by: George McCollister Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1392767811-28916-1-git-send-email-george.mccollister@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78157099b16..9b4c4f32013 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7001,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. -- cgit v1.2.3-70-g09d2 From 3908ac13b550c93f97d8db136ff572be5495bc06 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Feb 2014 19:52:23 +0400 Subject: sched/deadline: Cleanup RT leftovers from {inc/dec}_dl_migration In deadline class we do not have group scheduling. So, let's remove unnecessary X = X; equations. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393343543.4089.5.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15cbc17fbf8..aecf93030e0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; -- cgit v1.2.3-70-g09d2 From eec751ed41a0ae7e92a43c33a458d7bd1b941631 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 24 Feb 2014 11:47:12 +0100 Subject: sched/deadline: Switch CPU's presence test order Commit 82b9580 ("sched/deadline: Test for CPU's presence explicitly") changed how we check if a CPU returned by cpudeadline machinery is valid. But, we don't want to call cpu_present() if best_cpu is equal to -1. So, switch the order of tests inside WARN_ON(). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: konrad.wilk@oracle.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1393238832-9100-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b8838b56d1..5b9bb42b2d4 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } -- cgit v1.2.3-70-g09d2 From faa5993736d9b44b508cab4f1f3a77d66641c6f4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 21 Feb 2014 11:37:15 +0100 Subject: sched/deadline: Prevent rt_time growth to infinity Kirill Tkhai noted: Since deadline tasks share rt bandwidth, we must care about bandwidth timer set. Otherwise rt_time may grow up to infinity in update_curr_dl(), if there are no other available RT tasks on top level bandwidth. RT task were in fact throttled right after they got enqueued, and never executed again (rt_time never again went below rt_runtime). Peter then proposed to accrue DL execution on rt_time only when rt timer is active, and proposed a patch (this patch is a slight modification of that) to implement that behavior. While this solves Kirill problem, it has a drawback. Indeed, Kirill noted again: It looks we may get into a situation, when all CPU time is shared between RT and DL tasks: rt_runtime = n rt_period = 2n | RT working, DL sleeping | DL working, RT sleeping | ----------------------------------------------------------- | (1) duration = n | (2) duration = n | (repeat) |--------------------------|------------------------------| | (rt_bw timer is running) | (rt_bw timer is not running) | No time for fair tasks at all. While this can happen during the first period, if rq is always backlogged, RT tasks won't have the opportunity to execute anymore: rt_time reached rt_runtime during (1), suppose after (2) RT is enqueued back, it gets throttled since rt timer didn't fire, replenishment is from now on eaten up by DL tasks that accrue their execution on rt_time (while rt timer is active - we have an RT task waiting for replenishment). FAIR tasks are not touched after this first period. Ok, this is not ideal, and the situation is even worse! What above (the nice case), practically never happens in reality, where your rt timer is not aligned to tasks periods, tasks are in general not periodic, etc.. Long story short, you always risk to overload your system. This patch is based on Peter's idea, but exploits an additional fact: if you don't have RT tasks enqueued, it makes little sense to continue incrementing rt_time once you reached the upper limit (DL tasks have their own mechanism for throttling). This cures both problems: - no matter how many DL instances in the past, you'll have an rt_time slightly above rt_runtime when an RT task is enqueued, and from that point on (after the first replenishment), the task will normally execute; - you can still eat up all bandwidth during the first period, but not anymore after that, remember that DL execution will increment rt_time till the upper limit is reached. The situation is still not perfect! But, we have a simple solution for now, that limits how much you can jeopardize your system, as we keep working towards the right answer: RT groups scheduled using deadline servers. Reported-by: Kirill Tkhai Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140225151515.617714e2f2cd6c558531ba61@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++++-- kernel/sched/rt.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aecf93030e0..6e79b3faa4c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). @@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant. */ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b4..1999021042c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. -- cgit v1.2.3-70-g09d2 From 64be38ab03e9b238a1299857fef8b3707c0ed045 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:10:12 +0530 Subject: genirq: Include missing header file in irqdomain.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file include/linux/of_irq.h in kernel/irq/irqdomain.c because it contains prototype definition of function define in kernel/irq/irqdomain.c. This eliminates the following warning in kernel/irq/irqdomain.c: kernel/irq/irqdomain.c:468:14: warning: no previous prototype for ‘irq_create_of_mapping’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Cc: Benjamin Herrenschmidt Link: http://lkml.kernel.org/r/eb89aebea7ff1a46122918ac389ebecf8248be9a.1393493276.git.rashika.kheria@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb36fe5..f14033700c2 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 4729583006772b9530404bc1bb7c3aa4a10ffd4d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:03 +0800 Subject: cpuset: fix a locking issue in cpuset_migrate_mm() I can trigger a lockdep warning: # mount -t cgroup -o cpuset xxx /cgroup # mkdir /cgroup/cpuset # mkdir /cgroup/tmp # echo 0 > /cgroup/tmp/cpuset.cpus # echo 0 > /cgroup/tmp/cpuset.mems # echo 1 > /cgroup/tmp/cpuset.memory_migrate # echo $$ > /cgroup/tmp/tasks # echo 1 > /cgruop/tmp/cpuset.mems =============================== [ INFO: suspicious RCU usage. ] 3.14.0-rc1-0.1-default+ #32 Not tainted ------------------------------- include/linux/cgroup.h:682 suspicious rcu_dereference_check() usage! ... [] dump_stack+0x72/0x86 [] lockdep_rcu_suspicious+0x101/0x140 [] cpuset_migrate_mm+0xb1/0xe0 ... We used to hold cgroup_mutex when calling cpuset_migrate_mm(), but now we hold cpuset_mutex, which causes task_css() to complain. This is not a false-positive but a real issue. Holding cpuset_mutex won't prevent a task from migrating to another cpuset, and it won't prevent the original task->cgroup from destroying during this change. Fixes: 5d21cc2db040 (cpuset: replace cgroup_mutex locking with cpuset internal locking) Cc: # 3.9+ Signed-off-by: Li Zefan Sigend-off-by: Tejun Heo --- kernel/cpuset.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f..dba9e4aef69 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cpuset_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &tsk->mems_allowed); + rcu_read_unlock(); } /* -- cgit v1.2.3-70-g09d2 From 99afb0fd5f05aac467ffa85c36778fec4396209b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:36 +0800 Subject: cpuset: fix a race condition in __cpuset_node_allowed_softwall() It's not safe to access task's cpuset after releasing task_lock(). Holding callback_mutex won't help. Cc: Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dba9e4aef69..e6b1b66afe5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2482,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) task_lock(current); cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); return allowed; } -- cgit v1.2.3-70-g09d2 From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 13:37:38 -0500 Subject: tracing: Do not add event files for modules that fail tracepoints If a module fails to add its tracepoints due to module tainting, do not create the module event infrastructure in the debugfs directory. As the events will not work and worse yet, they will silently fail, making the user wonder why the events they enable do not display anything. Having a warning on module load and the events not visible to the users will make the cause of the problem much clearer. Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT" Acked-by: Mathieu Desnoyers Cc: stable@vger.kernel.org # 2.6.31+ Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 6 ++++++ kernel/trace/trace_events.c | 10 ++++++++++ kernel/tracepoint.c | 7 ++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index accc497f8d7..7159a0a933d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -60,6 +60,12 @@ struct tp_module { unsigned int num_tracepoints; struct tracepoint * const *tracepoints_ptrs; }; +bool trace_module_has_bad_taint(struct module *mod); +#else +static inline bool trace_module_has_bad_taint(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ struct tracepoint_iter { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb..f3989ceb5cd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1777,6 +1777,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c..031cc5655a5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) +{ + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod, *iter; @@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) * module headers (for forced load), to make sure we don't cause a crash. * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.2.3-70-g09d2