From eeb61e53ea19be0c4015b00b2e8b3b2185436f2b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Oct 2014 14:18:25 +0400 Subject: sched: Fix race between task_group and sched_task_group The race may happen when somebody is changing task_group of a forking task. Child's cgroup is the same as parent's after dup_task_struct() (there just memory copying). Also, cfs_rq and rt_rq are the same as parent's. But if parent changes its task_group before it's called cgroup_post_fork(), we do not reflect this situation on child. Child's cfs_rq and rt_rq remain the same, while child's task_group changes in cgroup_post_fork(). To fix this we introduce fork() method, which calls sched_move_task() directly. This function changes sched_task_group on appropriate (also its logic has no problem with freshly created tasks, so we shouldn't introduce something special; we are able just to use it). Possibly, this decides the Burke Libbey's problem: https://lkml.org/lkml/2014/10/24/456 Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414405105.19914.169.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44999505e1b..dde8adb7d0c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7833,6 +7833,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } +static void cpu_cgroup_fork(struct task_struct *task) +{ + sched_move_task(task); +} + static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { @@ -8205,6 +8210,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, .css_offline = cpu_cgroup_css_offline, + .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, -- cgit v1.2.3-70-g09d2 From 64be6f1f5f710f5995d41caf8a1767fe6d2b5a87 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 24 Oct 2014 10:16:37 +0100 Subject: sched/deadline: Don't replenish from a !SCHED_DEADLINE entity In the deboost path, right after the dl_boosted flag has been reset, we can currently end up replenishing using -deadline parameters of a !SCHED_DEADLINE entity. This of course causes a bug, as those parameters are empty. In the case depicted above it is safe to simply bail out, as the deboosted task is going to be back to its original scheduling class anyway. Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: vincent@legout.info Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Link: http://lkml.kernel.org/r/1414142198-18552-4-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 256e577faf1..92279eaf0ef 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -847,8 +847,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * smaller than our one... OTW we keep our runtime and * deadline. */ - if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) + if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) { pi_se = &pi_task->dl; + } else if (!dl_prio(p->normal_prio)) { + /* + * Special case in which we have a !SCHED_DEADLINE task + * that is going to be deboosted, but exceedes its + * runtime while doing so. No point in replenishing + * it, as it's going to return back to its original + * scheduling class after this. + */ + BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + return; + } /* * If p is throttled, we do nothing. In fact, if it exhausted -- cgit v1.2.3-70-g09d2 From aee38ea95419c818dfdde52b115aeffe9cbb259b Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 24 Oct 2014 10:16:38 +0100 Subject: sched/deadline: Fix races between rt_mutex_setprio() and dl_task_timer() dl_task_timer() is racy against several paths. Daniel noticed that the replenishment timer may experience a race condition against an enqueue_dl_entity() called from rt_mutex_setprio(). With his own words: rt_mutex_setprio() resets p->dl.dl_throttled. So the pattern is: start_dl_timer() throttled = 1, rt_mutex_setprio() throlled = 0, sched_switch() -> enqueue_task(), dl_task_timer-> enqueue_task() throttled is 0 => BUG_ON(on_dl_rq(dl_se)) fires as the scheduling entity is already enqueued on the -deadline runqueue. As we do for the other races, we just bail out in the replenishment timer code. Reported-by: Daniel Wagner Tested-by: Daniel Wagner Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: vincent@legout.info Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414142198-18552-5-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 92279eaf0ef..46167899d85 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -518,12 +518,20 @@ again: } /* - * We need to take care of a possible races here. In fact, the - * task might have changed its scheduling policy to something - * different from SCHED_DEADLINE or changed its reservation - * parameters (through sched_setattr()). + * We need to take care of several possible races here: + * + * - the task might have changed its scheduling policy + * to something different than SCHED_DEADLINE + * - the task might have changed its reservation parameters + * (through sched_setattr()) + * - the task might have been boosted by someone else and + * might be in the boosting/deboosting path + * + * In all this cases we bail out, as the task is already + * in the runqueue or is going to be enqueued back anyway. */ - if (!dl_task(p) || dl_se->dl_new) + if (!dl_task(p) || dl_se->dl_new || + dl_se->dl_boosted || !dl_se->dl_throttled) goto unlock; sched_clock_tick(); -- cgit v1.2.3-70-g09d2 From 1effd9f19324efb05fccc7421530e11a52db0278 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 22 Oct 2014 11:17:11 +0400 Subject: sched/numa: Fix unsafe get_task_struct() in task_numa_assign() Unlocked access to dst_rq->curr in task_numa_compare() is racy. If curr task is exiting this may be a reason of use-after-free: task_numa_compare() do_exit() ... current->flags |= PF_EXITING; ... release_task() ... ~~delayed_put_task_struct()~~ ... schedule() rcu_read_lock() ... cur = ACCESS_ONCE(dst_rq->curr) ... ... rq->curr = next; ... context_switch() ... finish_task_switch() ... put_task_struct() ... __put_task_struct() ... free_task_struct() task_numa_assign() ... get_task_struct() ... As noted by Oleg: <task_numa_assign() path does get_task_struct(dst_rq->curr) and this is not safe. The task_struct itself can't go away, but rcu_read_lock() can't save us from the final put_task_struct() in finish_task_switch(); this reference goes away without rcu gp>> The patch provides simple check of PF_EXITING flag. If it's not set, this guarantees that call_rcu() of delayed_put_task_struct() callback hasn't happened yet, so we can safely do get_task_struct() in task_numa_assign(). Locked dst_rq->lock protects from concurrency with the last schedule(). Reusing or unmapping of cur's memory may happen without it. Suggested-by: Oleg Nesterov Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413962231.19914.130.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b069bf3e70..fbc0b8214af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1164,9 +1164,19 @@ static void task_numa_compare(struct task_numa_env *env, long moveimp = imp; rcu_read_lock(); - cur = ACCESS_ONCE(dst_rq->curr); - if (cur->pid == 0) /* idle */ + + raw_spin_lock_irq(&dst_rq->lock); + cur = dst_rq->curr; + /* + * No need to move the exiting task, and this ensures that ->curr + * wasn't reaped and thus get_task_struct() in task_numa_assign() + * is safe under RCU read lock. + * Note that rcu_read_lock() itself can't protect from the final + * put_task_struct() after the last schedule(). + */ + if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + raw_spin_unlock_irq(&dst_rq->lock); /* * "imp" is the fault differential for the source task between the -- cgit v1.2.3-70-g09d2 From 2847c90e1b3ae95379af24894fc4f98e7f2fd705 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Wed, 22 Oct 2014 16:04:35 +0900 Subject: sched/fair: Care divide error in update_task_scan_period() While offling node by hot removing memory, the following divide error occurs: divide error: 0000 [#1] SMP [...] Call Trace: [...] handle_mm_fault [...] ? try_to_wake_up [...] ? wake_up_state [...] __do_page_fault [...] ? do_futex [...] ? put_prev_entity [...] ? __switch_to [...] do_page_fault [...] page_fault [...] RIP [] task_numa_fault RSP The issue occurs as follows: 1. When page fault occurs and page is allocated from node 1, task_struct->numa_faults_buffer_memory[] of node 1 is incremented and p->numa_faults_locality[] is also incremented as follows: o numa_faults_buffer_memory[] o numa_faults_locality[] NR_NUMA_HINT_FAULT_TYPES | 0 | 1 | ---------------------------------- ---------------------- node 0 | 0 | 0 | remote | 0 | node 1 | 0 | 1 | locale | 1 | ---------------------------------- ---------------------- 2. node 1 is offlined by hot removing memory. 3. When page fault occurs, fault_types[] is calculated by using p->numa_faults_buffer_memory[] of all online nodes in task_numa_placement(). But node 1 was offline by step 2. So the fault_types[] is calculated by using only p->numa_faults_buffer_memory[] of node 0. So both of fault_types[] are set to 0. 4. The values(0) of fault_types[] pass to update_task_scan_period(). 5. numa_faults_locality[1] is set to 1. So the following division is calculated. static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private){ ... ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); } 6. But both of private and shared are set to 0. So divide error occurs here. The divide error is rare case because the trigger is node offline. This patch always increments denominator for avoiding divide error. Signed-off-by: Yasuaki Ishimatsu Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/54475703.8000505@jp.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fbc0b8214af..e9abd4e4c5c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1530,7 +1530,7 @@ static void update_task_scan_period(struct task_struct *p, * scanning faster if shared accesses dominate as it may * simply bounce migrations uselessly */ - ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); + ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1)); diff = (diff * ratio) / NUMA_PERIOD_SLOTS; } -- cgit v1.2.3-70-g09d2 From 6419265899d9bd27e5ff9f8b43db3715407fc2ba Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 16 Oct 2014 14:39:37 +0400 Subject: sched/fair: Fix division by zero sysctl_numa_balancing_scan_size File /proc/sys/kernel/numa_balancing_scan_size_mb allows writing of zero. This bash command reproduces problem: $ while :; do echo 0 > /proc/sys/kernel/numa_balancing_scan_size_mb; \ echo 256 > /proc/sys/kernel/numa_balancing_scan_size_mb; done divide error: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 24112 Comm: bash Not tainted 3.17.0+ #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff88013c852600 ti: ffff880037a68000 task.ti: ffff880037a68000 RIP: 0010:[] [] task_scan_min+0x21/0x50 RSP: 0000:ffff880037a6bce0 EFLAGS: 00010246 RAX: 0000000000000a00 RBX: 00000000000003e8 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88013c852600 RBP: ffff880037a6bcf0 R08: 0000000000000001 R09: 0000000000015c90 R10: ffff880239bf6c00 R11: 0000000000000016 R12: 0000000000003fff R13: ffff88013c852600 R14: ffffea0008d1b000 R15: 0000000000000003 FS: 00007f12bb048700(0000) GS:ffff88007da00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000001505678 CR3: 0000000234770000 CR4: 00000000000006f0 Stack: ffff88013c852600 0000000000003fff ffff880037a6bd18 ffffffff810741d1 ffff88013c852600 0000000000003fff 000000000002bfff ffff880037a6bda8 ffffffff81077ef7 ffffea0008a56d40 0000000000000001 0000000000000001 Call Trace: [] task_scan_max+0x11/0x40 [] task_numa_fault+0x1f7/0xae0 [] ? migrate_misplaced_page+0x276/0x300 [] handle_mm_fault+0x62d/0xba0 [] __do_page_fault+0x191/0x510 [] ? native_smp_send_reschedule+0x42/0x60 [] ? check_preempt_curr+0x80/0xa0 [] ? wake_up_new_task+0x11c/0x1a0 [] ? do_fork+0x14d/0x340 [] ? get_unused_fd_flags+0x2b/0x30 [] ? __fd_install+0x1f/0x60 [] do_page_fault+0xc/0x10 [] page_fault+0x22/0x30 RIP [] task_scan_min+0x21/0x50 RSP ---[ end trace 9a826d16936c04de ]--- Also fix race in task_scan_min (it depends on compiler behaviour). Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Tomlin Cc: Andrew Morton Cc: Dario Faggioli Cc: David Rientjes Cc: Jens Axboe Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Rik van Riel Link: http://lkml.kernel.org/r/1413455977.24793.78.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +++-- kernel/sysctl.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9abd4e4c5c..34baa60f8a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -828,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) static unsigned int task_scan_min(struct task_struct *p) { + unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size); unsigned int scan, floor; unsigned int windows = 1; - if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) - windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; + if (scan_size < MAX_SCAN_WINDOW) + windows = MAX_SCAN_WINDOW / scan_size; floor = 1000 / windows; scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aada6d9fe7..15f2511a1b7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = { .data = &sysctl_numa_balancing_scan_size, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, }, { .procname = "numa_balancing", -- cgit v1.2.3-70-g09d2 From 009f60e2763568cdcd75bd1cf360c7c7165e2e60 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 5 Oct 2014 22:23:22 +0200 Subject: sched: stop the unbound recursion in preempt_schedule_context() preempt_schedule_context() does preempt_enable_notrace() at the end and this can call the same function again; exception_exit() is heavy and it is quite possible that need-resched is true again. 1. Change this code to dec preempt_count() and check need_resched() by hand. 2. As Linus suggested, we can use the PREEMPT_ACTIVE bit and avoid the enable/disable dance around __schedule(). But in this case we need to move into sched/core.c. 3. Cosmetic, but x86 forgets to declare this function. This doesn't really matter because it is only called by asm helpers, still it make sense to add the declaration into asm/preempt.h to match preempt_schedule(). Reported-by: Sasha Levin Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Graf Cc: Andrew Morton Cc: Christoph Lameter Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Steven Rostedt Cc: Peter Anvin Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Chuck Ebbert Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/20141005202322.GB27962@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 1 + kernel/context_tracking.c | 40 ---------------------------------------- kernel/sched/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 7024c12f7bf..400873450e3 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -105,6 +105,7 @@ static __always_inline bool should_resched(void) # ifdef CONFIG_CONTEXT_TRACKING extern asmlinkage void ___preempt_schedule_context(void); # define __preempt_schedule_context() asm ("call ___preempt_schedule_context") + extern asmlinkage void preempt_schedule_context(void); # endif #endif diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 5664985c46a..937ecdfdf25 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -107,46 +107,6 @@ void context_tracking_user_enter(void) } NOKPROBE_SYMBOL(context_tracking_user_enter); -#ifdef CONFIG_PREEMPT -/** - * preempt_schedule_context - preempt_schedule called by tracing - * - * The tracing infrastructure uses preempt_enable_notrace to prevent - * recursion and tracing preempt enabling caused by the tracing - * infrastructure itself. But as tracing can happen in areas coming - * from userspace or just about to enter userspace, a preempt enable - * can occur before user_exit() is called. This will cause the scheduler - * to be called when the system is still in usermode. - * - * To prevent this, the preempt_enable_notrace will use this function - * instead of preempt_schedule() to exit user context if needed before - * calling the scheduler. - */ -asmlinkage __visible void __sched notrace preempt_schedule_context(void) -{ - enum ctx_state prev_ctx; - - if (likely(!preemptible())) - return; - - /* - * Need to disable preemption in case user_exit() is traced - * and the tracer calls preempt_enable_notrace() causing - * an infinite recursion. - */ - preempt_disable_notrace(); - prev_ctx = exception_enter(); - preempt_enable_no_resched_notrace(); - - preempt_schedule(); - - preempt_disable_notrace(); - exception_exit(prev_ctx); - preempt_enable_notrace(); -} -EXPORT_SYMBOL_GPL(preempt_schedule_context); -#endif /* CONFIG_PREEMPT */ - /** * context_tracking_user_exit - Inform the context tracking that the CPU is * exiting userspace mode and entering the kernel. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dde8adb7d0c..240157c13dd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2951,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); + +#ifdef CONFIG_CONTEXT_TRACKING +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +asmlinkage __visible void __sched notrace preempt_schedule_context(void) +{ + enum ctx_state prev_ctx; + + if (likely(!preemptible())) + return; + + do { + __preempt_count_add(PREEMPT_ACTIVE); + /* + * Needs preempt disabled in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + prev_ctx = exception_enter(); + __schedule(); + exception_exit(prev_ctx); + + __preempt_count_sub(PREEMPT_ACTIVE); + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_CONTEXT_TRACKING */ + #endif /* CONFIG_PREEMPT */ /* -- cgit v1.2.3-70-g09d2 From fcd964dda5ece2fa77f78f843bc3455348787282 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Tue, 7 Oct 2014 17:29:07 +0800 Subject: sched: Update comments for CLONE_NEWNS Signed-off-by: Chen Hanxiao Acked-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1412674147-8941-1-git-send-email-chenhanxiao@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 34f9d7387d1..b932be9f5c5 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -13,7 +13,7 @@ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ #define CLONE_THREAD 0x00010000 /* Same thread group? */ -#define CLONE_NEWNS 0x00020000 /* New namespace group? */ +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ -- cgit v1.2.3-70-g09d2 From f3a7e1a9c464a32ee186ab91388313c82e7ce018 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Oct 2014 20:35:56 +0400 Subject: sched/dl: Fix preemption checks 1) switched_to_dl() check is wrong. We reschedule only if rq->curr is deadline task, and we do not reschedule if it's a lower priority task. But we must always preempt a task of other classes. 2) dl_task_timer(): Policy does not change in case of priority inheritance. rt_mutex_setprio() changes prio, while policy remains old. So we lose some balancing logic in dl_task_timer() and switched_to_dl() when we check policy instead of priority. Boosted task may be rq->curr. (I didn't change switched_from_dl() because no check is necessary there at all). I've looked at this place(switched_to_dl) several times and even fixed this function, but found just now... I suppose some performance tests may work better after this. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413909356.19914.128.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 46167899d85..5285332392d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -540,7 +540,7 @@ again: dl_se->dl_yielded = 0; if (task_on_rq_queued(p)) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); - if (task_has_dl_policy(rq->curr)) + if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); @@ -1626,8 +1626,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ - if (check_resched && task_has_dl_policy(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + if (check_resched) { + if (dl_task(rq->curr)) + check_preempt_curr_dl(rq, p, 0); + else + resched_curr(rq); + } } } -- cgit v1.2.3-70-g09d2