diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 265 |
1 files changed, 183 insertions, 82 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e9bd0b1fa9..9855e87d671 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (imb > old_imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; /* XXX missing power terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: @@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - sched_setnuma(p, env.dst_nid); + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); /* * Reset the scan period if the task is being rescheduled on an @@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. */ static void numa_migrate_preferred(struct task_struct *p) { + unsigned long interval = HZ; + /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) @@ -1497,7 +1535,7 @@ static void task_numa_placement(struct task_struct *p) /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; - spin_lock(group_lock); + spin_lock_irq(group_lock); } /* Find the node with the highest number of faults */ @@ -1572,7 +1610,7 @@ static void task_numa_placement(struct task_struct *p) } } - spin_unlock(group_lock); + spin_unlock_irq(group_lock); } /* Preferred node as the node with the most faults */ @@ -1677,7 +1715,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - double_lock(&my_grp->lock, &grp->lock); + BUG_ON(irqs_disabled()); + double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; @@ -1691,7 +1730,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->nr_tasks++; spin_unlock(&my_grp->lock); - spin_unlock(&grp->lock); + spin_unlock_irq(&grp->lock); rcu_assign_pointer(p->numa_group, grp); @@ -1706,18 +1745,19 @@ no_join: void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; - int i; void *numa_faults = p->numa_faults_memory; + unsigned long flags; + int i; if (grp) { - spin_lock(&grp->lock); + spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); grp->nr_tasks--; - spin_unlock(&grp->lock); + spin_unlock_irqrestore(&grp->lock, flags); rcu_assign_pointer(p->numa_group, NULL); put_numa_group(grp); } @@ -1737,6 +1777,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); int priv; if (!numabalancing_enabled) @@ -1785,6 +1826,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + task_numa_placement(p); /* @@ -1799,7 +1851,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) @@ -3128,7 +3180,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) */ if (!cfs_b->timer_active) { __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); } if (cfs_b->runtime > 0) { @@ -3300,14 +3352,14 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, false); raw_spin_unlock(&cfs_b->lock); } @@ -3351,7 +3403,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3689,7 +3741,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) } /* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) { /* * The timer may be active because we're trying to set a new bandwidth @@ -3704,7 +3756,7 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cpu_relax(); raw_spin_lock(&cfs_b->lock); /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) + if (!force && cfs_b->timer_active) return; } @@ -3883,7 +3935,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); - inc_nr_running(rq); + add_nr_running(rq, 1); } hrtick_update(rq); } @@ -3943,7 +3995,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } if (!se) { - dec_nr_running(rq); + sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } hrtick_update(rq); @@ -4014,7 +4066,7 @@ static void record_wakee(struct task_struct *p) * about the loss. */ if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } @@ -4448,10 +4500,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } @@ -4519,6 +4571,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; } #endif /* CONFIG_SMP */ @@ -5069,6 +5124,7 @@ task_hot(struct task_struct *p, u64 now) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5082,21 +5138,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5111,16 +5175,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else @@ -5563,6 +5634,7 @@ static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; + s64 delta; /* * Since we're reading these variables without serialization make sure @@ -5571,7 +5643,11 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq_clock(rq) - age_stamp); + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ @@ -6639,27 +6715,62 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; - int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); + /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6669,20 +6780,20 @@ static int idle_balance(struct rq *this_rq) update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; int continue_balancing = 1; u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); break; + } if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -6694,41 +6805,37 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; - if (pulled_task) + update_next_balance(sd, 0, &next_balance); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; } rcu_read_unlock(); raw_spin_lock(&this_rq->lock); + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + /* - * While browsing the domains, we released the rq lock. - * A task could have be enqueued in the meantime + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) { + if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - goto out; - } - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. - */ +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; - } - - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; -out: /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running && - (this_rq->dl.dl_nr_running || - (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; if (pulled_task) { @@ -7009,16 +7116,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; @@ -7034,6 +7134,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); |