diff options
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 537 |
1 files changed, 307 insertions, 230 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a725167898..3bdf01b494f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,6 +90,22 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> +#ifdef smp_mb__before_atomic +void __smp_mb__before_atomic(void) +{ + smp_mb__before_atomic(); +} +EXPORT_SYMBOL(__smp_mb__before_atomic); +#endif + +#ifdef smp_mb__after_atomic +void __smp_mb__after_atomic(void) +{ + smp_mb__after_atomic(); +} +EXPORT_SYMBOL(__smp_mb__after_atomic); +#endif + void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; @@ -506,6 +522,71 @@ static inline void init_hrtick(void) #endif /* CONFIG_SCHED_HRTICK */ /* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, val) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (val)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif +#endif + +/* * resched_task - mark a task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it @@ -521,18 +602,18 @@ void resched_task(struct task_struct *p) if (test_tsk_need_resched(p)) return; - set_tsk_need_resched(p); - cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); set_preempt_need_resched(); return; } - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) + if (set_nr_and_not_polling(p)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void resched_cpu(int cpu) @@ -595,27 +676,10 @@ static void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - /* - * This is safe, as this function is called with the timer - * wheel base lock of (cpu) held. When the CPU is on the way - * to idle and has not yet set rq->curr to idle then it will - * be serialized on the timer wheel base lock and take the new - * timer into account automatically. - */ - if (rq->curr != rq->idle) - return; - - /* - * We can set TIF_RESCHED on the idle task of the other CPU - * lockless. The worst case is that the other CPU runs the - * idle task through an additional NOOP schedule() - */ - set_tsk_need_resched(rq->idle); - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(rq->idle)) + if (set_nr_and_not_polling(rq->idle)) smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } static bool wake_up_full_nohz_cpu(int cpu) @@ -841,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->clock_task += delta; #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) - if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) + if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) sched_rt_avg_update(rq, irq_delta + steal); #endif } @@ -1320,7 +1384,7 @@ out: * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", + printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } @@ -1474,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static void sched_ttwu_pending(void) +void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; + unsigned long flags; - raw_spin_lock(&rq->lock); + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); while (llist) { p = llist_entry(llist, struct task_struct, wake_entry); @@ -1488,7 +1556,7 @@ static void sched_ttwu_pending(void) ttwu_do_activate(rq, p, 0); } - raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&rq->lock, flags); } void scheduler_ipi(void) @@ -1534,8 +1602,14 @@ void scheduler_ipi(void) static void ttwu_queue_remote(struct task_struct *p, int cpu) { - if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) - smp_send_reschedule(cpu); + struct rq *rq = cpu_rq(cpu); + + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } } bool cpus_share_cache(int this_cpu, int that_cpu) @@ -2480,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes preempt_count_add(int val) +void preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2506,8 +2580,9 @@ void __kprobes preempt_count_add(int val) } } EXPORT_SYMBOL(preempt_count_add); +NOKPROBE_SYMBOL(preempt_count_add); -void __kprobes preempt_count_sub(int val) +void preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2528,6 +2603,7 @@ void __kprobes preempt_count_sub(int val) __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); +NOKPROBE_SYMBOL(preempt_count_sub); #endif @@ -2810,6 +2886,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) barrier(); } while (need_resched()); } +NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ @@ -3002,7 +3079,7 @@ EXPORT_SYMBOL(set_user_nice); int can_nice(const struct task_struct *p, const int nice) { /* convert nice value [19,-20] to rlimit style value [1,40] */ - int nice_rlim = 20 - nice; + int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); @@ -3026,17 +3103,10 @@ SYSCALL_DEFINE1(nice, int, increment) * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ - if (increment < -40) - increment = -40; - if (increment > 40) - increment = 40; - + increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); nice = task_nice(current) + increment; - if (nice < MIN_NICE) - nice = MIN_NICE; - if (nice > MAX_NICE) - nice = MAX_NICE; + nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) return -EPERM; @@ -3626,13 +3696,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr, */ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); -out: - return ret; + return 0; err_size: put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; + return -E2BIG; } /** @@ -3685,7 +3753,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (retval) return retval; - if (attr.sched_policy < 0) + if ((int)attr.sched_policy < 0) return -EINVAL; rcu_read_lock(); @@ -3792,7 +3860,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, for (; addr < end; addr++) { if (*addr) - goto err_size; + return -EFBIG; } attr->size = usize; @@ -3802,12 +3870,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, if (ret) return -EFAULT; -out: - return ret; - -err_size: - ret = -E2BIG; - goto out; + return 0; } /** @@ -4084,6 +4147,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { + rcu_cond_resched(); if (should_resched()) { __cond_resched(); return 1; @@ -4102,15 +4166,18 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { + bool need_rcu_resched = rcu_should_resched(); int resched = should_resched(); int ret = 0; lockdep_assert_held(lock); - if (spin_needbreak(lock) || resched) { + if (spin_needbreak(lock) || resched || need_rcu_resched) { spin_unlock(lock); if (resched) __cond_resched(); + else if (unlikely(need_rcu_resched)) + rcu_resched(); else cpu_relax(); ret = 1; @@ -4124,6 +4191,7 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); + rcu_cond_resched(); /* BH disabled OK, just recording QSes. */ if (should_resched()) { local_bh_enable(); __cond_resched(); @@ -4178,7 +4246,7 @@ EXPORT_SYMBOL(yield); * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. */ -bool __sched yield_to(struct task_struct *p, bool preempt) +int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; @@ -5072,10 +5140,20 @@ static struct notifier_block migration_notifier = { .priority = CPU_PRI_MIGRATION, }; +static void __cpuinit set_cpu_rq_start_time(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + rq->age_stamp = sched_clock_cpu(cpu); +} + static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + set_cpu_rq_start_time(); + return NOTIFY_OK; case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -5194,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, } /* - * Even though we initialize ->power to something semi-sane, - * we leave power_orig unset. This allows us to detect if + * Even though we initialize ->capacity to something semi-sane, + * we leave capacity_orig unset. This allows us to detect if * domain iteration is still funny without causing /0 traps. */ - if (!group->sgp->power_orig) { + if (!group->sgc->capacity_orig) { printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); break; } @@ -5223,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->sgp->power != SCHED_POWER_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->sgp->power); + if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { + printk(KERN_CONT " (cpu_capacity = %d)", + group->sgc->capacity); } group = group->next; @@ -5283,8 +5360,9 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES)) { + SD_SHARE_CPUCAPACITY | + SD_SHARE_PKG_RESOURCES | + SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) return 0; } @@ -5313,9 +5391,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | - SD_SHARE_CPUPOWER | + SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + SD_PREFER_SIBLING | + SD_SHARE_POWERDOMAIN); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5437,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void) return rd; } -static void free_sched_groups(struct sched_group *sg, int free_sgp) +static void free_sched_groups(struct sched_group *sg, int free_sgc) { struct sched_group *tmp, *first; @@ -5448,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp) do { tmp = sg->next; - if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) - kfree(sg->sgp); + if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) + kfree(sg->sgc); kfree(sg); sg = tmp; @@ -5467,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu) if (sd->flags & SD_OVERLAP) { free_sched_groups(sd->groups, 1); } else if (atomic_dec_and_test(&sd->groups->ref)) { - kfree(sd->groups->sgp); + kfree(sd->groups->sgc); kfree(sd->groups); } kfree(sd); @@ -5589,17 +5668,6 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); -static const struct cpumask *cpu_cpu_mask(int cpu) -{ - return cpumask_of_node(cpu_to_node(cpu)); -} - -struct sd_data { - struct sched_domain **__percpu sd; - struct sched_group **__percpu sg; - struct sched_group_power **__percpu sgp; -}; - struct s_data { struct sched_domain ** __percpu sd; struct root_domain *rd; @@ -5612,21 +5680,6 @@ enum s_alloc { sa_none, }; -struct sched_domain_topology_level; - -typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); - -#define SDTL_OVERLAP 0x01 - -struct sched_domain_topology_level { - sched_domain_init_f init; - sched_domain_mask_f mask; - int flags; - int numa_level; - struct sd_data data; -}; - /* * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. @@ -5704,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, i); - if (atomic_inc_return(&sg->sgp->ref) == 1) + sg->sgc = *per_cpu_ptr(sdd->sgc, i); + if (atomic_inc_return(&sg->sgc->ref) == 1) build_group_mask(sd, sg); /* - * Initialize sgp->power such that even if we mess up the + * Initialize sgc->capacity such that even if we mess up the * domains and no possible iteration will get us here, we won't * die on a /0 trap. */ - sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); - sg->sgp->power_orig = sg->sgp->power; + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the @@ -5752,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) if (sg) { *sg = *per_cpu_ptr(sdd->sg, cpu); - (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); - atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); + atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ } return cpu; @@ -5762,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) /* * build_sched_groups will build a circular linked list of the groups * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_power to 0. + * and ->cpu_capacity to 0. * * Assumes the sched_domain tree is fully constructed */ @@ -5794,8 +5847,6 @@ build_sched_groups(struct sched_domain *sd, int cpu) continue; group = get_group(i, sdd, &sg); - cpumask_clear(sched_group_cpus(sg)); - sg->sgp->power = 0; cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { @@ -5818,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu) } /* - * Initialize sched groups cpu_power. + * Initialize sched groups cpu_capacity. * - * cpu_power indicates the capacity of sched group, which is used while + * cpu_capacity indicates the capacity of sched group, which is used while * distributing the load between different sched groups in a sched domain. - * Typically cpu_power for all the groups in a sched domain will be same unless - * there are asymmetries in the topology. If there are asymmetries, group - * having more cpu_power will pickup more load compared to the group having - * less cpu_power. + * Typically cpu_capacity for all the groups in a sched domain will be same + * unless there are asymmetries in the topology. If there are asymmetries, + * group having more cpu_capacity will pickup more load compared to the + * group having less cpu_capacity. */ -static void init_sched_groups_power(int cpu, struct sched_domain *sd) +static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; @@ -5841,13 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_balance_cpu(sg)) return; - update_group_power(sd, cpu); - atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); -} - -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; + update_group_capacity(sd, cpu); + atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } /* @@ -5855,34 +5901,6 @@ int __weak arch_sd_sibling_asym_packing(void) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ -#ifdef CONFIG_SCHED_DEBUG -# define SD_INIT_NAME(sd, type) sd->name = #type -#else -# define SD_INIT_NAME(sd, type) do { } while (0) -#endif - -#define SD_INIT_FUNC(type) \ -static noinline struct sched_domain * \ -sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ -{ \ - struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ - *sd = SD_##type##_INIT; \ - SD_INIT_NAME(sd, type); \ - sd->private = &tl->data; \ - return sd; \ -} - -SD_INIT_FUNC(CPU) -#ifdef CONFIG_SCHED_SMT - SD_INIT_FUNC(SIBLING) -#endif -#ifdef CONFIG_SCHED_MC - SD_INIT_FUNC(MC) -#endif -#ifdef CONFIG_SCHED_BOOK - SD_INIT_FUNC(BOOK) -#endif - static int default_relax_domain_level = -1; int sched_domain_level_max; @@ -5966,103 +5984,158 @@ static void claim_allocations(int cpu, struct sched_domain *sd) if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) - *per_cpu_ptr(sdd->sgp, cpu) = NULL; -} - -#ifdef CONFIG_SCHED_SMT -static const struct cpumask *cpu_smt_mask(int cpu) -{ - return topology_thread_cpumask(cpu); + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; } -#endif - -/* - * Topology list, bottom-up. - */ -static struct sched_domain_topology_level default_topology[] = { -#ifdef CONFIG_SCHED_SMT - { sd_init_SIBLING, cpu_smt_mask, }, -#endif -#ifdef CONFIG_SCHED_MC - { sd_init_MC, cpu_coregroup_mask, }, -#endif -#ifdef CONFIG_SCHED_BOOK - { sd_init_BOOK, cpu_book_mask, }, -#endif - { sd_init_CPU, cpu_cpu_mask, }, - { NULL, }, -}; - -static struct sched_domain_topology_level *sched_domain_topology = default_topology; - -#define for_each_sd_topology(tl) \ - for (tl = sched_domain_topology; tl->init; tl++) #ifdef CONFIG_NUMA - static int sched_domains_numa_levels; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; +#endif -static inline int sd_local_flags(int level) -{ - if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) - return 0; - - return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; -} +/* + * SD_flags allowed in topology descriptions. + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * + * Odd one out: + * SD_ASYM_PACKING - describes SMT quirks + */ +#define TOPOLOGY_SD_FLAGS \ + (SD_SHARE_CPUCAPACITY | \ + SD_SHARE_PKG_RESOURCES | \ + SD_NUMA | \ + SD_ASYM_PACKING | \ + SD_SHARE_POWERDOMAIN) static struct sched_domain * -sd_numa_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); - int level = tl->numa_level; - int sd_weight = cpumask_weight( - sched_domains_numa_masks[level][cpu_to_node(cpu)]); + int sd_weight, sd_flags = 0; + +#ifdef CONFIG_NUMA + /* + * Ugly hack to pass state to sd_numa_mask()... + */ + sched_domains_curr_level = tl->numa_level; +#endif + + sd_weight = cpumask_weight(tl->mask(cpu)); + + if (tl->sd_flags) + sd_flags = (*tl->sd_flags)(); + if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, + "wrong sd_flags in topology description\n")) + sd_flags &= ~TOPOLOGY_SD_FLAGS; *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, .imbalance_pct = 125, - .cache_nice_tries = 2, - .busy_idx = 3, - .idle_idx = 2, + + .cache_nice_tries = 0, + .busy_idx = 0, + .idle_idx = 0, .newidle_idx = 0, .wake_idx = 0, .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE - | 0*SD_BALANCE_EXEC - | 0*SD_BALANCE_FORK + | 1*SD_BALANCE_EXEC + | 1*SD_BALANCE_FORK | 0*SD_BALANCE_WAKE - | 0*SD_WAKE_AFFINE - | 0*SD_SHARE_CPUPOWER + | 1*SD_WAKE_AFFINE + | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES - | 1*SD_SERIALIZE + | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING - | 1*SD_NUMA - | sd_local_flags(level) + | 0*SD_NUMA + | sd_flags , + .last_balance = jiffies, .balance_interval = sd_weight, + .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, +#ifdef CONFIG_SCHED_DEBUG + .name = tl->name, +#endif }; - SD_INIT_NAME(sd, NUMA); - sd->private = &tl->data; /* - * Ugly hack to pass state to sd_numa_mask()... + * Convert topological properties into behaviour. */ - sched_domains_curr_level = tl->numa_level; + + if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->imbalance_pct = 110; + sd->smt_gain = 1178; /* ~15% */ + + } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { + sd->imbalance_pct = 117; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + +#ifdef CONFIG_NUMA + } else if (sd->flags & SD_NUMA) { + sd->cache_nice_tries = 2; + sd->busy_idx = 3; + sd->idle_idx = 2; + + sd->flags |= SD_SERIALIZE; + if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { + sd->flags &= ~(SD_BALANCE_EXEC | + SD_BALANCE_FORK | + SD_WAKE_AFFINE); + } + +#endif + } else { + sd->flags |= SD_PREFER_SIBLING; + sd->cache_nice_tries = 1; + sd->busy_idx = 2; + sd->idle_idx = 1; + } + + sd->private = &tl->data; return sd; } +/* + * Topology list, bottom-up. + */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +struct sched_domain_topology_level *sched_domain_topology = default_topology; + +#define for_each_sd_topology(tl) \ + for (tl = sched_domain_topology; tl->mask; tl++) + +void set_sched_topology(struct sched_domain_topology_level *tl) +{ + sched_domain_topology = tl; +} + +#ifdef CONFIG_NUMA + static const struct cpumask *sd_numa_mask(int cpu) { return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; @@ -6206,7 +6279,10 @@ static void sched_init_numa(void) } } - tl = kzalloc((ARRAY_SIZE(default_topology) + level) * + /* Compute default topology size */ + for (i = 0; sched_domain_topology[i].mask; i++); + + tl = kzalloc((i + level + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -6214,18 +6290,19 @@ static void sched_init_numa(void) /* * Copy the default topology bits.. */ - for (i = 0; default_topology[i].init; i++) - tl[i] = default_topology[i]; + for (i = 0; sched_domain_topology[i].mask; i++) + tl[i] = sched_domain_topology[i]; /* * .. and append 'j' levels of NUMA goodness. */ for (j = 0; j < level; i++, j++) { tl[i] = (struct sched_domain_topology_level){ - .init = sd_numa_init, .mask = sd_numa_mask, + .sd_flags = cpu_numa_flags, .flags = SDTL_OVERLAP, .numa_level = j, + SD_INIT_NAME(NUMA) }; } @@ -6310,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sg) return -ENOMEM; - sdd->sgp = alloc_percpu(struct sched_group_power *); - if (!sdd->sgp) + sdd->sgc = alloc_percpu(struct sched_group_capacity *); + if (!sdd->sgc) return -ENOMEM; for_each_cpu(j, cpu_map) { struct sched_domain *sd; struct sched_group *sg; - struct sched_group_power *sgp; + struct sched_group_capacity *sgc; sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); @@ -6335,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), + sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); - if (!sgp) + if (!sgc) return -ENOMEM; - *per_cpu_ptr(sdd->sgp, j) = sgp; + *per_cpu_ptr(sdd->sgc, j) = sgc; } } @@ -6367,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); - if (sdd->sgp) - kfree(*per_cpu_ptr(sdd->sgp, j)); + if (sdd->sgc) + kfree(*per_cpu_ptr(sdd->sgc, j)); } free_percpu(sdd->sd); sdd->sd = NULL; free_percpu(sdd->sg); sdd->sg = NULL; - free_percpu(sdd->sgp); - sdd->sgp = NULL; + free_percpu(sdd->sgc); + sdd->sgc = NULL; } } @@ -6383,7 +6460,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = tl->init(tl, cpu); + struct sched_domain *sd = sd_init(tl, cpu); if (!sd) return child; @@ -6445,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map, } } - /* Calculate CPU power for physical packages and nodes */ + /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { if (!cpumask_test_cpu(i, cpu_map)) continue; for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { claim_allocations(i, sd); - init_sched_groups_power(i, sd); + init_sched_groups_capacity(i, sd); } } @@ -6895,7 +6972,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_POWER_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -6953,6 +7030,7 @@ void __init sched_init(void) if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); + set_cpu_rq_start_time(); #endif init_sched_fair_class(); @@ -7620,7 +7698,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css_parent(css)); + struct task_group *parent = css_tg(css->parent); if (parent) sched_online_group(tg, parent); @@ -7751,8 +7829,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) /* restart the period timer (if active) to handle new period expiry */ if (runtime_enabled && cfs_b->timer_active) { /* force a reprogram */ - cfs_b->timer_active = 0; - __start_cfs_bandwidth(cfs_b); + __start_cfs_bandwidth(cfs_b, true); } raw_spin_unlock_irq(&cfs_b->lock); |