diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 113 |
1 files changed, 86 insertions, 27 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe..22321db6495 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - struct sched_group *sg; - int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, check assigned siblings to find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); + for_each_lower_domain(sd) { + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(sd->idle_buddy)) + return sd->idle_buddy; } -done: + return target; } @@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; - int src_cpu; struct rq *src_rq; + int src_cpu; int dst_cpu; struct rq *dst_rq; + struct cpumask *dst_grpmask; + int new_dst_cpu; enum cpu_idle_type idle; long imbalance; unsigned int flags; @@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int new_dst_cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + new_dst_cpu = cpumask_first_and(env->dst_grpmask, + tsk_cpus_allowed(p)); + if (new_dst_cpu < nr_cpu_ids) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = new_dst_cpu; + } return 0; } + + /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); + max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -4267,6 +4281,7 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; + lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -4284,7 +4299,13 @@ more_balance: double_rq_lock(this_rq, busiest); if (!env.loop) update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4296,14 +4317,52 @@ more_balance: /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && + lb_iterations++ < max_lb_iterations) { + + this_rq = cpu_rq(env.new_dst_cpu); + env.dst_rq = this_rq; + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } |