diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1871 |
1 files changed, 1079 insertions, 792 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 054a6012de9..18d38e4ec7b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,8 +75,11 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #include "sched_cpupri.h" +#include "workqueue_sched.h" +#include "sched_autogroup.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -252,6 +255,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + + atomic_t load_weight; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -267,25 +272,18 @@ struct task_group { struct task_group *parent; struct list_head siblings; struct list_head children; -}; -#define root_task_group init_task_group +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif +}; -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. - */ +/* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return list_empty(&root_task_group.children); -} -#endif - -# define INIT_TASK_GROUP_LOAD NICE_0_LOAD +# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD /* * A weight of 0 or 1 can cause arithmetics problems. @@ -298,59 +296,13 @@ static int root_task_group_empty(void) #define MIN_SHARES 2 #define MAX_SHARES (1UL << 18) -static int init_task_group_load = INIT_TASK_GROUP_LOAD; +static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group; - -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_CGROUP_SCHED - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ - /* - * Strictly speaking this rcu_read_lock() is not needed since the - * task_group is tied to the cgroup, which in turn can never go away - * as long as there are tasks attached to it. - * - * However since task_group() uses task_subsys_state() which is an - * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. - */ - rcu_read_lock(); -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif - rcu_read_unlock(); -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} +struct task_group root_task_group; #endif /* CONFIG_CGROUP_SCHED */ @@ -387,6 +339,7 @@ struct cfs_rq { * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This * list is used during load balance. */ + int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ @@ -405,14 +358,17 @@ struct cfs_rq { unsigned long h_load; /* - * this cpu's part of tg->shares + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time */ - unsigned long shares; + u64 load_avg; + u64 load_period; + u64 load_stamp, load_last, load_unacc_exec_time; - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; + unsigned long load_contribution; #endif #endif }; @@ -471,9 +427,7 @@ struct root_domain { */ cpumask_var_t rto_mask; atomic_t rto_count; -#ifdef CONFIG_SMP struct cpupri cpupri; -#endif }; /* @@ -482,7 +436,7 @@ struct root_domain { */ static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */ /* * This is the main, per-CPU runqueue data structure. @@ -502,9 +456,10 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -532,11 +487,12 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr, *idle; + struct task_struct *curr, *idle, *stop; unsigned long next_balance; struct mm_struct *prev_mm; u64 clock; + u64 clock_task; atomic_t nr_iowait; @@ -544,6 +500,8 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; @@ -562,6 +520,10 @@ struct rq { u64 avg_idle; #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif + /* calc_load related fields */ unsigned long calc_load_update; long calc_load_active; @@ -591,26 +553,13 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; #endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) -{ - rq->curr->sched_class->check_preempt_curr(rq, p, flags); - /* - * A queue event has occurred, and we're going to schedule. In - * this case, we can save a useless back to back clock update. - */ - if (test_tsk_need_resched(p)) - rq->skip_clock_update = 1; -} +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); static inline int cpu_of(struct rq *rq) { @@ -642,10 +591,67 @@ static inline int cpu_of(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) -inline void update_rq_clock(struct rq *rq) +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification + * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * holds that lock for each task it moves into the cgroup. Therefore + * by holding that lock, we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + struct cgroup_subsys_state *css; + + if (p->flags & PF_EXITING) + return &root_task_group; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&task_rq(p)->lock)); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static void update_rq_clock_task(struct rq *rq, s64 delta); + +static void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + s64 delta; + + if (rq->skip_clock_update) + return; + + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + rq->clock += delta; + update_rq_clock_task(rq, delta); } /* @@ -722,7 +728,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -733,16 +739,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); - if (strncmp(buf, "NO_", 3) == 0) { + if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; cmp += 3; } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else @@ -792,20 +797,6 @@ late_initcall(sched_init_debug); const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - -/* * period over which we average the RT time consumption, measured * in ms. * @@ -969,14 +960,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void task_rq_unlock_wait(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&rq->lock); -} - static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { @@ -1202,6 +1185,27 @@ static void resched_cpu(int cpu) #ifdef CONFIG_NO_HZ /* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} +/* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event * which is scheduled to wake up that CPU. In case of a completely @@ -1241,16 +1245,6 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -int nohz_ratelimit(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 diff = rq->clock - rq->nohz_stamp; - - rq->nohz_stamp = rq->clock; - - return diff < (NSEC_PER_SEC / HZ) >> 1; -} - #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1263,6 +1257,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } @@ -1284,6 +1284,10 @@ static void resched_task(struct task_struct *p) static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } + +static void sched_avg_update(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -1341,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1507,24 +1517,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1544,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long __percpu *update_shares_data; - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, - unsigned long sd_rq_weight, - unsigned long *usd_rq_weight) -{ - unsigned long shares, rq_weight; - int boost = 0; - - rq_weight = usd_rq_weight[cpu]; - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum_j shares_j * rq_weight_i - * shares_i = ----------------------------- - * \Sum_j rq_weight_j - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - __set_se_shares(tg->se[cpu], shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} - -/* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. - */ -static int tg_shares_up(struct task_group *tg, void *data) -{ - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; - unsigned long *usd_rq_weight; - struct sched_domain *sd = data; - unsigned long flags; - int i; - - if (!tg->se[0]) - return 0; - - local_irq_save(flags); - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); - - for_each_cpu(i, sched_domain_span(sd)) { - weight = tg->cfs_rq[i]->load.weight; - usd_rq_weight[i] = weight; - - rq_weight += weight; - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - if (!weight) - weight = NICE_0_LOAD; - - sum_weight += weight; - shares += tg->cfs_rq[i]->shares; - } - - if (!rq_weight) - rq_weight = sum_weight; - - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); - - local_irq_restore(flags); - - return 0; -} - /* * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child @@ -1653,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data) load = cpu_rq(cpu)->load.weight; } else { load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; + load *= tg->se[cpu]->load.weight; load /= tg->parent->cfs_rq[cpu]->load.weight + 1; } @@ -1662,37 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(struct sched_domain *sd) -{ - s64 elapsed; - u64 now; - - if (root_task_group_empty()) - return; - - now = cpu_clock(raw_smp_processor_id()); - elapsed = now - sd->last_update; - - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } -} - static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -#else - -static inline void update_shares(struct sched_domain *sd) -{ -} - #endif #ifdef CONFIG_PREEMPT @@ -1814,18 +1688,10 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -1843,7 +1709,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1861,12 +1727,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ @@ -1920,13 +1780,194 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dec_nr_running(rq); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ + s64 irq_delta; + + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + rq->clock_task += delta; + + if (irq_delta && sched_feat(NONIRQ_POWER)) + sched_rt_avg_update(rq, irq_delta); +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ + rq->clock_task += delta; +} + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_autogroup.c" +#include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + /* * __normal_prio - return the priority that is based on the static prio */ @@ -1994,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +{ + const struct sched_class *class; + + if (p->sched_class == rq->curr->sched_class) { + rq->curr->sched_class->check_preempt_curr(rq, p, flags); + } else { + for_each_class(class) { + if (class == rq->curr->sched_class) + break; + if (class == p->sched_class) { + resched_task(rq->curr); + break; + } + } + } + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + rq->skip_clock_update = 1; +} + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: @@ -2006,6 +2072,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2056,10 +2125,8 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, int dest_cpu) +static bool migrate_task(struct task_struct *p, struct rq *rq) { - struct rq *rq = task_rq(p); - /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. @@ -2239,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (unlikely(dest_cpu >= nr_cpu_ids)) { - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -2288,11 +2352,55 @@ static void update_avg(u64 *avg, u64 sample) } #endif -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local, + unsigned long en_flags) +{ + schedstat_inc(p, se.statistics.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.statistics.nr_wakeups_local); + else + schedstat_inc(p, se.statistics.nr_wakeups_remote); + + activate_task(rq, p, en_flags); +} + +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/** * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2300,7 +2408,8 @@ static void update_avg(u64 *avg, u64 sample) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2380,38 +2489,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.statistics.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); - activate_task(rq, p, en_flags); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu, en_flags); success = 1; - out_running: - trace_sched_wakeup(p, success); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif + ttwu_post_activation(p, rq, wake_flags, success); out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2420,6 +2502,37 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2515,7 +2628,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2528,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif put_cpu(); } @@ -2797,14 +2921,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (likely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (likely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -2885,9 +3009,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } @@ -2919,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + #ifdef CONFIG_NO_HZ /* * For NO_HZ we delay the active fold to the next LOAD_FREQ update. @@ -2948,6 +3081,128 @@ static long calc_load_fold_idle(void) return delta; } + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(unsigned long ticks) +{ + long delta, active, n; + + if (time_before(jiffies, calc_load_update)) + return; + + /* + * If we crossed a calc_load_update boundary, make sure to fold + * any pending idle changes, the respective CPUs might have + * missed the tick driven calc_load_account_active() update + * due to NO_HZ. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + /* + * If we were idle for multiple load cycles, apply them. + */ + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. + */ +} #else static void calc_load_account_idle(struct rq *this_rq) { @@ -2957,6 +3212,10 @@ static inline long calc_load_fold_idle(void) { return 0; } + +static void calc_global_nohz(unsigned long ticks) +{ +} #endif /** @@ -2974,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; -} - /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. */ -void calc_global_load(void) +void calc_global_load(unsigned long ticks) { - unsigned long upd = calc_load_update + 10; long active; - if (time_before(jiffies, upd)) + calc_global_nohz(ticks); + + if (time_before(jiffies, calc_load_update + 10)) return; active = atomic_long_read(&calc_load_tasks); @@ -3024,23 +3276,102 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. */ static void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { unsigned long old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -3048,10 +3379,18 @@ static void update_cpu_load(struct rq *this_rq) * example. */ if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } + sched_avg_update(this_rq); +} + +static void update_cpu_load_active(struct rq *this_rq) +{ + update_cpu_load(this_rq); + calc_load_account_active(this_rq); } @@ -3077,7 +3416,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -3106,7 +3445,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock - p->se.exec_start; + ns = rq->clock_task - p->se.exec_start; if ((s64)ns < 0) ns = 0; } @@ -3255,7 +3594,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (in_serving_softirq()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); @@ -3371,9 +3710,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * utime); + temp *= utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -3404,9 +3743,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * cputime.utime); + temp *= cputime.utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -3438,11 +3777,11 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load(rq); + update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr); + perf_event_task_tick(); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -3548,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); + schedstat_inc(this_rq(), rq_sched_info.bkl_count); schedstat_inc(prev, sched_info.bkl_count); } #endif @@ -3558,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->se.on_rq) update_rq_clock(rq); - rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@ -3581,17 +3919,13 @@ pick_next_task(struct rq *rq) return p; } - class = sched_class_highest; - for ( ; ; ) { + for_each_class(class) { p = class->pick_next_task(rq); if (p) return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; } + + BUG(); /* the idle class will always have a runnable task */ } /* @@ -3610,7 +3944,6 @@ need_resched: rq = cpu_rq(cpu); rcu_note_context_switch(cpu); prev = rq->curr; - switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3621,13 +3954,27 @@ need_resched_nonpreemptible: hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } @@ -3638,6 +3985,8 @@ need_resched_nonpreemptible: put_prev_task(rq, prev); next = pick_next_task(rq); + clear_tsk_need_resched(prev); + rq->skip_clock_update = 0; if (likely(prev != next)) { sched_info_switch(prev, next); @@ -3649,8 +3998,10 @@ need_resched_nonpreemptible: context_switch(rq, prev, next); /* unlocks the rq */ /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3659,11 +4010,8 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; + if (unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (need_resched()) @@ -3716,8 +4064,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) /* * Owner changed, break to re-assess state. */ - if (lock->owner != owner) + if (lock->owner != owner) { + /* + * If the lock has switched to a different owner, + * we likely have heavy contention. Return 0 to quit + * optimistic spinning and not contend further: + */ + if (lock->owner) + return 0; break; + } /* * Is that owner really running on that cpu? @@ -3725,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) if (task_thread_info(rq->curr) != owner || need_resched()) return 0; - cpu_relax(); + arch_mutex_cpu_relax(); } return 1; @@ -3738,7 +4094,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched preempt_schedule(void) +asmlinkage void __sched notrace preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -3750,9 +4106,9 @@ asmlinkage void __sched preempt_schedule(void) return; do { - add_preempt_count(PREEMPT_ACTIVE); + add_preempt_count_notrace(PREEMPT_ACTIVE); schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count_notrace(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -4037,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) { @@ -4062,6 +4418,23 @@ int __sched wait_for_completion_killable(struct completion *x) EXPORT_SYMBOL(wait_for_completion_killable); /** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** * try_wait_for_completion - try to decrement a completion without blocking * @x: completion structure * @@ -4178,6 +4551,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = task_rq_lock(p, &flags); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->se.on_rq; @@ -4394,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p) } static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) + const struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4436,12 +4810,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &flags); + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) @@ -4469,17 +4839,7 @@ recheck: } if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -4494,6 +4854,32 @@ recheck: * runqueue lock must be held. */ rq = __task_rq_lock(p); + + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EINVAL; + } + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EPERM; + } + } +#endif + /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; @@ -4538,7 +4924,7 @@ recheck: * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, true); } @@ -4556,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * but our caller might not have that capability. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, false); } @@ -4705,13 +5091,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); - again: +again: retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { @@ -5072,7 +5458,7 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) @@ -5155,7 +5541,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -5224,7 +5622,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_min_granularity); SET_SYSCTL(sched_latency); SET_SYSCTL(sched_wakeup_granularity); - SET_SYSCTL(sched_shares_ratelimit); #undef SET_SYSCTL } @@ -5300,7 +5697,7 @@ again: goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, dest_cpu)) { + if (migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); @@ -5382,29 +5779,20 @@ static int migration_cpu_stop(void *data) } #ifdef CONFIG_HOTPLUG_CPU + /* - * Figure out where task on dead CPU should go, use force if necessary. + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. */ -void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void idle_task_exit(void) { - struct rq *rq = cpu_rq(dead_cpu); - int needs_cpu, uninitialized_var(dest_cpu); - unsigned long flags; + struct mm_struct *mm = current->active_mm; - local_irq_save(flags); + BUG_ON(cpu_online(smp_processor_id())); - raw_spin_lock(&rq->lock); - needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); - if (needs_cpu) - dest_cpu = select_fallback_rq(dead_cpu, p); - raw_spin_unlock(&rq->lock); - /* - * It can only fail if we race with set_cpus_allowed(), - * in the racer should migrate the task anyway. - */ - if (needs_cpu) - __migrate_task(p, dead_cpu, dest_cpu); - local_irq_restore(flags); + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); } /* @@ -5417,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) static void migrate_nr_uninterruptible(struct rq *rq_src) { struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - unsigned long flags; - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); } /* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. + * remove the tasks which were accounted by rq from calc_load_tasks. */ -void sched_idle_next(void) +static void calc_global_load_remove(struct rq *rq) { - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - raw_spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - activate_task(rq, p, 0); - - raw_spin_unlock_irqrestore(&rq->lock, flags); + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } /* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +static void migrate_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. */ - raw_spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - raw_spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + rq->stop = NULL; for ( ; ; ) { - if (!rq->nr_running) + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) break; + next = pick_next_task(rq); - if (!next) - break; + BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); } -} -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) -{ - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + rq->stop = stop; } + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -5748,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: - case CPU_ONLINE_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -5768,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_live_tasks(cpu); - /* Idle task back to normal (off runqueue, low prio) */ - raw_spin_lock_irq(&rq->lock); - deactivate_task(rq, rq->idle, 0); - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - raw_spin_unlock_irq(&rq->lock); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - calc_global_load_remove(rq); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + + migrate_nr_uninterruptible(rq); + calc_global_load_remove(rq); break; #endif } @@ -5805,20 +6121,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -6053,23 +6398,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd) { - gfp_t gfp = GFP_KERNEL; - memset(rd, 0, sizeof(*rd)); - if (bootmem) - gfp = GFP_NOWAIT; - - if (!alloc_cpumask_var(&rd->span, gfp)) + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, gfp)) + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, gfp)) + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - if (cpupri_init(&rd->cpupri, bootmem) != 0) + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6085,7 +6425,7 @@ out: static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain, true); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } @@ -6098,7 +6438,7 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - if (init_rootdomain(rd, false) != 0) { + if (init_rootdomain(rd) != 0) { kfree(rd); return NULL; } @@ -6308,6 +6648,7 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t this_sibling_map; cpumask_var_t this_core_map; + cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6319,6 +6660,7 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, + sa_this_book_map, sa_this_core_map, sa_this_sibling_map, sa_nodemask, @@ -6354,31 +6696,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct static_sched_domain, core_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int cpu_to_core_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; - +#ifdef CONFIG_SCHED_SMT cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); +#else + group = cpu; +#endif if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } -#elif defined(CONFIG_SCHED_MC) +#endif /* CONFIG_SCHED_MC */ + +/* + * book sched-domains: + */ +#ifdef CONFIG_SCHED_BOOK +static DEFINE_PER_CPU(struct static_sched_domain, book_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); + static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) +cpu_to_book_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { + int group = cpu; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); +#endif if (sg) - *sg = &per_cpu(sched_group_core, cpu).sg; - return cpu; + *sg = &per_cpu(sched_group_book, group).sg; + return group; } -#endif +#endif /* CONFIG_SCHED_BOOK */ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); @@ -6388,7 +6747,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; -#ifdef CONFIG_SCHED_MC +#ifdef CONFIG_SCHED_BOOK + cpumask_and(mask, cpu_book_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_MC) cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) @@ -6584,6 +6946,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_first_cpu(sd->groups)) return; + sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); + child = sd->child; sd->groups->cpu_power = 0; @@ -6649,6 +7013,9 @@ SD_INIT_FUNC(CPU) #ifdef CONFIG_SCHED_MC SD_INIT_FUNC(MC) #endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif static int default_relax_domain_level = -1; @@ -6698,6 +7065,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_book_map: + free_cpumask_var(d->this_book_map); /* fall through */ case sa_this_core_map: free_cpumask_var(d->this_core_map); /* fall through */ case sa_this_sibling_map: @@ -6744,8 +7113,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) return sa_this_core_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_book_map; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -6803,6 +7174,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_book_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_BOOK + sd = &per_cpu(book_domains, i).sd; + SD_INIT(sd, BOOK); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + static struct sched_domain *__build_mc_sched_domain(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *parent, int i) @@ -6860,6 +7248,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #endif +#ifdef CONFIG_SCHED_BOOK + case SD_LV_BOOK: /* set up book groups */ + cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); + if (cpu == cpumask_first(d->this_book_map)) + init_sched_build_groups(d->this_book_map, cpu_map, + &cpu_to_book_group, + d->send_covered, d->tmpmask); + break; +#endif case SD_LV_CPU: /* set up physical groups */ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); if (!cpumask_empty(d->nodemask)) @@ -6907,12 +7304,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); } @@ -6943,6 +7342,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, init_sched_groups_power(i, sd); } #endif +#ifdef CONFIG_SCHED_BOOK + for_each_cpu(i, cpu_map) { + sd = &per_cpu(book_domains, i).sd; + init_sched_groups_power(i, sd); + } +#endif for_each_cpu(i, cpu_map) { sd = &per_cpu(phys_domains, i).sd; @@ -6968,6 +7373,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) sd = &per_cpu(core_domains, i).sd; +#elif defined(CONFIG_SCHED_BOOK) + sd = &per_cpu(book_domains, i).sd; #else sd = &per_cpu(phys_domains, i).sd; #endif @@ -7277,29 +7684,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7345,10 +7758,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); @@ -7427,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, + struct sched_entity *se, int cpu, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - if (add) - list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; - /* se could be NULL for init_task_group */ + /* se could be NULL for root_task_group */ if (!se) return; @@ -7448,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; + update_load_set(&se->load, 0); se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent) { struct rq *rq = cpu_rq(cpu); @@ -7465,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; if (!rt_se) @@ -7501,18 +7907,18 @@ void __init sched_init(void) ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); #ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.se = (struct sched_entity **)ptr; + root_task_group.se = (struct sched_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); - init_task_group.cfs_rq = (struct cfs_rq **)ptr; + root_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_se = (struct sched_rt_entity **)ptr; + root_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); - init_task_group.rt_rq = (struct rt_rq **)ptr; + root_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ @@ -7532,20 +7938,16 @@ void __init sched_init(void) global_rt_period(), global_rt_runtime()); #ifdef CONFIG_RT_GROUP_SCHED - init_rt_bandwidth(&init_task_group.rt_bandwidth, + init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED - list_add(&init_task_group.list, &task_groups); - INIT_LIST_HEAD(&init_task_group.children); - + list_add(&root_task_group.list, &task_groups); + INIT_LIST_HEAD(&root_task_group.children); + autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), - __alignof__(unsigned long)); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -7557,45 +7959,45 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - init_task_group.shares = init_task_group_load; + root_task_group.shares = root_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED /* - * How much cpu bandwidth does init_task_group get? + * How much cpu bandwidth does root_task_group get? * * In case of task-groups formed thr' the cgroup filesystem, it * gets 100% of the cpu resources in the system. This overall * system cpu resource is divided among the tasks of - * init_task_group and its child task-groups in a fair manner, + * root_task_group and its child task-groups in a fair manner, * based on each entity's (task or task-group's) weight * (se->load.weight). * - * In other words, if init_task_group has 10 tasks of weight + * In other words, if root_task_group has 10 tasks of weight * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * - * We achieve this by letting init_task_group's tasks sit - * directly in rq->cfs (i.e init_task_group->se[] = NULL). + * We achieve this by letting root_task_group's tasks sit + * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#endif + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#endif + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -7605,6 +8007,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7649,16 +8055,17 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_event_init(); - scheduler_running = 1; } @@ -7852,26 +8259,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } return 1; - err_free_rq: +err_free_rq: kfree(cfs_rq); - err: +err: return 0; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #else /* !CONFG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) @@ -7884,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } @@ -7942,27 +8351,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } return 1; - err_free_rq: +err_free_rq: kfree(rt_rq); - err: +err: return 0; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} #else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { @@ -7973,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED @@ -7988,6 +8378,7 @@ static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); + autogroup_free(tg); kfree(tg); } @@ -7996,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; - int i; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -8009,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) goto err; spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } list_add_rcu(&tg->list, &task_groups); WARN_ON(!parent); /* root should already exist */ @@ -8042,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) unsigned long flags; int i; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { + /* end participation in shares distribution */ + for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } + + spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); @@ -8076,12 +8462,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - set_task_rq(tsk, task_cpu(tsk)); - #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk, on_rq); + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else #endif + set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8093,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - int on_rq; - - on_rq = se->on_rq; - if (on_rq) - dequeue_entity(cfs_rq, se, 0); - - se->load.weight = shares; - se->load.inv_weight = 0; - - if (on_rq) - enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) @@ -8142,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ tg->shares = shares; for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares); + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); return 0; @@ -8307,7 +8648,7 @@ static int tg_set_bandwidth(struct task_group *tg, raw_spin_unlock(&rt_rq->rt_runtime_lock); } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: +unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -8471,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - return &init_task_group.css; + return &root_task_group.css; } parent = cgroup_tg(cgrp->parent); @@ -8542,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } } +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) @@ -8614,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .destroy = cpu_cgroup_destroy, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, .early_init = 1, @@ -8898,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = { }; #endif /* CONFIG_CGROUP_CPUACCT */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ - barrier(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ - return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. - */ -void synchronize_sched_expedited(void) -{ - int snap, trycount = 0; - - smp_mb(); /* ensure prior mod happens before capturing snap. */ - snap = atomic_read(&synchronize_sched_expedited_count) + 1; - get_online_cpus(); - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - if (trycount++ < 10) - udelay(trycount * num_online_cpus()); - else { - synchronize_sched(); - return; - } - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { - smp_mb(); /* ensure test happens before caller kfree */ - return; - } - get_online_cpus(); - } - atomic_inc(&synchronize_sched_expedited_count); - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#endif /* #else #ifndef CONFIG_SMP */ |