diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 367 |
1 files changed, 193 insertions, 174 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 3798b954e6e..748ff924a29 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -267,6 +267,10 @@ struct task_group { struct cgroup_subsys_state css; #endif +#ifdef CONFIG_USER_SCHED + uid_t uid; +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; @@ -292,6 +296,12 @@ struct task_group { #ifdef CONFIG_USER_SCHED +/* Helper function to pass uid information to create_sched_user() */ +void set_tg_uid(struct user_struct *user) +{ + user->tg->uid = user->uid; +} + /* * Root task group. * Every UID task group (including init_task_group aka UID-0) will @@ -594,6 +604,8 @@ struct rq { #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_exp_empty; @@ -711,45 +723,18 @@ static __read_mostly char *sched_feat_names[] = { #undef SCHED_FEAT -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_show(struct seq_file *m, void *v) { - char *buf; - int r = 0; - int len = 0; int i; for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; - } - - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); } + seq_puts(m, "\n"); - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; + return 0; } static ssize_t @@ -794,10 +779,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return cnt; } +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static __init int sched_init_debug(void) @@ -1482,27 +1474,13 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - int boost = 0; unsigned long shares; unsigned long rq_weight; if (!tg->se[cpu]) return; - rq_weight = tg->cfs_rq[cpu]->load.weight; - - /* - * If there are currently no tasks on the cpu pretend there is one of - * average load so that when a new task gets to run here it will not - * get delayed by group starvation. - */ - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - if (unlikely(rq_weight > sd_rq_weight)) - rq_weight = sd_rq_weight; + rq_weight = tg->cfs_rq[cpu]->rq_weight; /* * \Sum shares * rq_weight @@ -1510,7 +1488,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, * \Sum rq_weight * */ - shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); if (abs(shares - tg->se[cpu]->load.weight) > @@ -1519,11 +1497,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + tg->cfs_rq[cpu]->shares = shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1537,13 +1511,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long rq_weight = 0; + unsigned long weight, rq_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { - rq_weight += tg->cfs_rq[i]->load.weight; + /* + * If there are currently no tasks on the cpu pretend there + * is one of average load so that when a new task gets to + * run here it will not get delayed by group starvation. + */ + weight = tg->cfs_rq[i]->load.weight; + if (!weight) + weight = NICE_0_LOAD; + + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1553,9 +1537,6 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - if (!rq_weight) - rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) update_group_shares_cpu(tg, i, shares, rq_weight); @@ -1620,6 +1601,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) #endif +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + spin_unlock(&this_rq->lock); + BUG_ON(1); + } + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); + ret = 1; + } else + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); + } + return ret; +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} #endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2264,6 +2278,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) smp_wmb(); rq = task_rq_lock(p, &flags); + update_rq_clock(rq); old_state = p->state; if (!(old_state & state)) goto out; @@ -2321,7 +2336,6 @@ out_activate: schedstat_inc(p, se.nr_wakeups_local); else schedstat_inc(p, se.nr_wakeups_remote); - update_rq_clock(rq); activate_task(rq, p, 1); success = 1; @@ -2822,40 +2836,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - spin_unlock(&this_rq->lock); - BUG_ON(1); - } - if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - spin_unlock(&this_rq->lock); - spin_lock(&busiest->lock); - spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); - ret = 1; - } else - spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); - } - return ret; -} - -static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then @@ -3716,7 +3696,7 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; - int pulled_task = -1; + int pulled_task = 0; unsigned long next_balance = jiffies + HZ; cpumask_t tmpmask; @@ -6150,7 +6130,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) /* * Figure out where task on dead CPU should go, use force if necessary. - * NOTE: interrupts should be disabled by the caller */ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { @@ -6662,28 +6641,6 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6703,8 +6660,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " @@ -6840,6 +6796,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; } if (~cflags & pflags) return 0; @@ -7360,13 +7318,21 @@ struct allmasks { }; #if NR_CPUS > 128 -#define SCHED_CPUMASK_ALLOC 1 -#define SCHED_CPUMASK_FREE(v) kfree(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ + *masks = kmalloc(sizeof(**masks), GFP_KERNEL); +} +static inline void sched_cpumask_free(struct allmasks *masks) +{ + kfree(masks); +} #else -#define SCHED_CPUMASK_ALLOC 0 -#define SCHED_CPUMASK_FREE(v) -#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +static inline void sched_cpumask_alloc(struct allmasks **masks) +{ } +static inline void sched_cpumask_free(struct allmasks *masks) +{ } #endif #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ @@ -7442,9 +7408,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map, return -ENOMEM; } -#if SCHED_CPUMASK_ALLOC /* get space for all scratch cpumask variables */ - allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + sched_cpumask_alloc(&allmasks); if (!allmasks) { printk(KERN_WARNING "Cannot alloc cpumask array\n"); kfree(rd); @@ -7453,7 +7418,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, #endif return -ENOMEM; } -#endif + tmpmask = (cpumask_t *)allmasks; @@ -7707,13 +7672,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map, cpu_attach_domain(sd, rd, i); } - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); return 0; #ifdef CONFIG_NUMA error: free_sched_groups(cpu_map, tmpmask); - SCHED_CPUMASK_FREE((void *)allmasks); + sched_cpumask_free(allmasks); kfree(rd); return -ENOMEM; #endif @@ -7736,8 +7701,14 @@ static struct sched_domain_attr *dattr_cur; */ static cpumask_t fallback_doms; -void __attribute__((weak)) arch_update_cpu_topology(void) +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. + */ +int __attribute__((weak)) arch_update_cpu_topology(void) { + return 0; } /* @@ -7777,8 +7748,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - unregister_sched_domain_sysctl(); - for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); @@ -7831,17 +7800,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { int i, j, n; + int new_topology; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < n; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7856,12 +7829,12 @@ match1: ndoms_cur = 0; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; + WARN_ON_ONCE(dattr_new); } /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur; j++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { if (cpus_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; @@ -8516,7 +8489,7 @@ static int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; + struct sched_entity *se; struct rq *rq; int i; @@ -8532,18 +8505,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); if (!se) goto err; - parent_se = parent ? parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; @@ -8604,7 +8576,7 @@ static int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -8621,18 +8593,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) goto err; - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); if (!rt_se) goto err; - parent_se = parent ? parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; @@ -9275,11 +9246,12 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ -/* track cpu usage of a group of tasks */ +/* track cpu usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct cpuacct *parent; }; struct cgroup_subsys cpuacct_subsys; @@ -9313,6 +9285,9 @@ static struct cgroup_subsys_state *cpuacct_create( return ERR_PTR(-ENOMEM); } + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + return &ca->css; } @@ -9326,6 +9301,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) kfree(ca); } +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + /* return total cpu usage (in nanoseconds) of a group */ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { @@ -9333,17 +9343,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) u64 totalcpuusage = 0; int i; - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); - - /* - * Take rq->lock to make 64-bit addition safe on 32-bit - * platforms. - */ - spin_lock_irq(&cpu_rq(i)->lock); - totalcpuusage += *cpuusage; - spin_unlock_irq(&cpu_rq(i)->lock); - } + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); return totalcpuusage; } @@ -9360,23 +9361,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, goto out; } - for_each_possible_cpu(i) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); - spin_lock_irq(&cpu_rq(i)->lock); - *cpuusage = 0; - spin_unlock_irq(&cpu_rq(i)->lock); - } out: return err; } +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + static struct cftype files[] = { { .name = "usage", .read_u64 = cpuusage_read, .write_u64 = cpuusage_write, }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -9392,14 +9409,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) static void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; + int cpu; if (!cpuacct_subsys.active) return; + cpu = task_cpu(tsk); ca = task_ca(tsk); - if (ca) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); + for (; ca; ca = ca->parent) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } |