diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 17 | ||||
-rw-r--r-- | kernel/events/core.c | 1 | ||||
-rw-r--r-- | kernel/irq/chip.c | 8 | ||||
-rw-r--r-- | kernel/irq/internals.h | 3 | ||||
-rw-r--r-- | kernel/irq/manage.c | 39 | ||||
-rw-r--r-- | kernel/irq/migration.c | 13 | ||||
-rw-r--r-- | kernel/sched/core.c | 249 | ||||
-rw-r--r-- | kernel/sched/fair.c | 71 | ||||
-rw-r--r-- | kernel/sched/rt.c | 53 | ||||
-rw-r--r-- | kernel/sched/sched.h | 2 | ||||
-rw-r--r-- | kernel/smpboot.c | 17 | ||||
-rw-r--r-- | kernel/sys.c | 60 | ||||
-rw-r--r-- | kernel/time/clockevents.c | 3 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 19 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 2 |
15 files changed, 389 insertions, 168 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f3527d6184..72fcd3069a9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -896,10 +896,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) mutex_unlock(&cgroup_mutex); /* - * Drop the active superblock reference that we took when we - * created the cgroup + * We want to drop the active superblock reference from the + * cgroup creation after all the dentry refs are gone - + * kill_sb gets mighty unhappy otherwise. Mark + * dentry->d_fsdata with cgroup_diput() to tell + * cgroup_d_release() to call deactivate_super(). */ - deactivate_super(cgrp->root->sb); + dentry->d_fsdata = cgroup_diput; /* * if we're getting rid of the cgroup, refcount should ensure @@ -925,6 +928,13 @@ static int cgroup_delete(const struct dentry *d) return 1; } +static void cgroup_d_release(struct dentry *dentry) +{ + /* did cgroup_diput() tell me to deactivate super? */ + if (dentry->d_fsdata == cgroup_diput) + deactivate_super(dentry->d_sb); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1532,6 +1542,7 @@ static int cgroup_get_rootdir(struct super_block *sb) static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, .d_delete = cgroup_delete, + .d_release = cgroup_d_release, }; struct inode *inode = diff --git a/kernel/events/core.c b/kernel/events/core.c index 5b06cbbf693..f85c0154b33 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3181,7 +3181,6 @@ static void perf_event_for_each(struct perf_event *event, event = event->group_leader; perf_event_for_each_child(event, func); - func(event); list_for_each_entry(sibling, &event->sibling_list, group_entry) perf_event_for_each_child(sibling, func); mutex_unlock(&ctx->mutex); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc275e4f629..eebd6d5cfb4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -275,8 +275,10 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -324,8 +326,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; goto out_unlock; + } handle_irq_event(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8e5c56b3b7d..001fa5bab49 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -101,6 +101,9 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); +extern int irq_do_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force); + /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ea0c6c2ae6f..8c548232ba3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -142,6 +142,25 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif +int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_desc *desc = irq_data_to_desc(data); + struct irq_chip *chip = irq_data_get_irq_chip(data); + int ret; + + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + + return ret; +} + int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { struct irq_chip *chip = irq_data_get_irq_chip(data); @@ -152,14 +171,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = chip->irq_set_affinity(data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(data->affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - ret = 0; - } + ret = irq_do_set_affinity(data, mask, false); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -283,9 +295,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; - int ret, node = desc->irq_data.node; + int node = desc->irq_data.node; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -311,13 +322,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(mask, nodemask)) cpumask_and(mask, mask, nodemask); } - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } + irq_do_set_affinity(&desc->irq_data, mask, false); return 0; } #else diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index c3c89751b32..ca3f4aaff70 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -42,17 +42,8 @@ void irq_move_masked_irq(struct irq_data *idata) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) - < nr_cpu_ids)) { - int ret = chip->irq_set_affinity(&desc->irq_data, - desc->pending_mask, false); - switch (ret) { - case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, desc->pending_mask); - case IRQ_SET_MASK_OK_NOCOPY: - irq_set_thread_affinity(desc); - } - } + if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) + irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false); cpumask_clear(desc->pending_mask); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc3..d5594a4268d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = #define SCHED_FEAT(name, enabled) \ #name , -static __read_mostly char *sched_feat_names[] = { +static const char * const sched_feat_names[] = { #include "features.h" - NULL }; #undef SCHED_FEAT @@ -2517,25 +2516,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2547,12 +2553,38 @@ void update_idle_cpu_load(struct rq *this_rq) } /* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + +/* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); @@ -4982,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* @@ -5524,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ #ifdef CONFIG_SCHED_DEBUG -static __read_mostly int sched_domain_debug_enabled; +static __read_mostly int sched_debug_enabled; -static int __init sched_domain_debug_setup(char *str) +static int __init sched_debug_setup(char *str) { - sched_domain_debug_enabled = 1; + sched_debug_enabled = 1; return 0; } -early_param("sched_debug", sched_domain_debug_setup); +early_param("sched_debug", sched_debug_setup); + +static inline bool sched_debug(void) +{ + return sched_debug_enabled; +} static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) @@ -5572,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->sgp->power) { + /* + * Even though we initialize ->power to something semi-sane, + * we leave power_orig unset. This allows us to detect if + * domain iteration is still funny without causing /0 traps. + */ + if (!group->sgp->power_orig) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -5620,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_domain_debug_enabled) + if (!sched_debug_enabled) return; if (!sd) { @@ -5641,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ # define sched_domain_debug(sd, cpu) do { } while (0) +static inline bool sched_debug(void) +{ + return false; +} #endif /* CONFIG_SCHED_DEBUG */ static int sd_degenerate(struct sched_domain *sd) @@ -5962,6 +6008,44 @@ struct sched_domain_topology_level { struct sd_data data; }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -5980,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -5987,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -5997,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); - sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); - atomic_inc(&sg->sgp->ref); + sg->sgp = *per_cpu_ptr(sdd->sgp, i); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); - if (cpumask_test_cpu(cpu, sg_span)) + /* + * Initialize sgp->power such that even if we mess up the + * domains and no possible iteration will get us here, we won't + * die on a /0 trap. + */ + sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ + if ((!groups && cpumask_test_cpu(cpu, sg_span)) || + group_balance_cpu(sg) == cpu) groups = sg; if (!first) @@ -6074,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6115,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); @@ -6165,11 +6268,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -6279,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol #ifdef CONFIG_NUMA static int sched_domains_numa_levels; -static int sched_domains_numa_scale; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; static inline int sd_local_flags(int level) { - if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) + if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) return 0; return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; @@ -6344,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu) return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; } +static void sched_numa_warn(const char *str) +{ + static int done = false; + int i,j; + + if (done) + return; + + done = true; + + printk(KERN_WARNING "ERROR: %s\n\n", str); + + for (i = 0; i < nr_node_ids; i++) { + printk(KERN_WARNING " "); + for (j = 0; j < nr_node_ids; j++) + printk(KERN_CONT "%02d ", node_distance(i,j)); + printk(KERN_CONT "\n"); + } + printk(KERN_WARNING "\n"); +} + +static bool find_numa_distance(int distance) +{ + int i; + + if (distance == node_distance(0, 0)) + return true; + + for (i = 0; i < sched_domains_numa_levels; i++) { + if (sched_domains_numa_distance[i] == distance) + return true; + } + + return false; +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6351,7 +6486,6 @@ static void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_scale = curr_distance; sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); if (!sched_domains_numa_distance) return; @@ -6362,23 +6496,41 @@ static void sched_init_numa(void) * * Assumes node_distance(0,j) includes all distances in * node_distance(i,j) in order to avoid cubic time. - * - * XXX: could be optimized to O(n log n) by using sort() */ next_distance = curr_distance; for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - int distance = node_distance(0, j); - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; + for (k = 0; k < nr_node_ids; k++) { + int distance = node_distance(i, k); + + if (distance > curr_distance && + (distance < next_distance || + next_distance == curr_distance)) + next_distance = distance; + + /* + * While not a strong assumption it would be nice to know + * about cases where if node A is connected to B, B is not + * equally connected to A. + */ + if (sched_debug() && node_distance(k, i) != distance) + sched_numa_warn("Node-distance not symmetric"); + + if (sched_debug() && i && !find_numa_distance(distance)) + sched_numa_warn("Node-0 not representative"); + } + if (next_distance != curr_distance) { + sched_domains_numa_distance[level++] = next_distance; + sched_domains_numa_levels = level; + curr_distance = next_distance; + } else break; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + /* + * In case of sched_debug() we verify the above assumption. + */ + if (!sched_debug()) + break; } /* * 'level' contains the number of unique distances, excluding the @@ -6403,7 +6555,7 @@ static void sched_init_numa(void) return; for (j = 0; j < nr_node_ids; j++) { - struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); + struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); if (!mask) return; @@ -6490,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sgp) return -ENOMEM; @@ -6543,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -6551,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; } @@ -6691,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); - dattr_cur = NULL; err = build_sched_domains(doms_cur[0], NULL); register_sched_domain_sysctl(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 940e6d17cf9..c099cc6eebe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available; + u64 total, available, age_stamp, avg; - total = sched_avg_period() + (rq->clock - rq->age_stamp); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = ACCESS_ONCE(rq->age_stamp); + avg = ACCESS_ONCE(rq->rt_avg); + + total = sched_avg_period() + (rq->clock - age_stamp); - if (unlikely(total < rq->rt_avg)) { + if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ available = 0; } else { - available = total - rq->rt_avg; + available = total - avg; } if (unlikely((s64)total < SCHED_POWER_SCALE)) @@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) power = 0; - group = child->groups; - do { - power += group->sgp->power; - group = group->next; - } while (group != child->groups); + if (child->flags & SD_OVERLAP) { + /* + * SD_OVERLAP domains cannot assume that child groups + * span the current group. + */ - sdg->sgp->power = power; + for_each_cpu(cpu, sched_group_cpus(sdg)) + power += power_of(cpu); + } else { + /* + * !SD_OVERLAP domains can assume that child groups + * span the current group. + */ + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + } + + sdg->sgp->power_orig = sdg->sgp->power = power; } /* @@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. - * @sd: The sched_domain whose statistics are to be updated. + * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. @@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i; if (local_group) - balance_cpu = group_first_cpu(group); + balance_cpu = group_balance_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; @@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { first_idle_cpu = 1; balance_cpu = i; } @@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, /** * update_sd_pick_busiest - return 1 on busiest group - * @sd: sched_domain whose statistics are to be checked + * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics - * @this_cpu: the current cpu * * Determine if @sg is a busier group than the previously selected * busiest group. @@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. - * @sd: sched_domain whose statistics are to be updated. - * @this_cpu: Cpu for which load balance is currently performed. - * @idle: Idle status of this_cpu + * @env: The load balancing environment. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, * Returns 1 when packing is required and a task should be moved to * this CPU. The amount of the imbalance is returned in *imbalance. * - * @sd: The sched_domain whose packing is to be checked. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain which is to be packed - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: returns amount of imbalanced due to packing. */ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) { @@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) * fix_small_imbalance - Calculate the minor imbalance that exists * amongst the groups of a sched_domain, during * load balancing. + * @env: The load balancing environment. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. - * @this_cpu: The cpu at whose sched_domain we're performing load-balance. - * @imbalance: Variable to store the imbalance. */ static inline void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) @@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Also calculates the amount of weighted load which should be moved * to restore balance. * - * @sd: The sched_domain whose busiest group is to be returned. - * @this_cpu: The cpu for which load balancing is currently being performed. - * @imbalance: Variable which stores amount of weighted load which should - * be moved to restore balance/put a group to idle. - * @idle: The idle status of this_cpu. + * @env: The load balancing environment. * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515..573e1ca0110 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) task_running(rq, task) || !task->on_rq)) { - raw_spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; } @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); @@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = RR_TIMESLICE; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are the + * only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24c..6d52cea7f33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); + #endif /* CONFIG_SMP */ #include "stats.h" diff --git a/kernel/smpboot.c b/kernel/smpboot.c index e1a797e028a..98f60c5caa1 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -31,6 +31,12 @@ void __init idle_thread_set_boot_cpu(void) per_cpu(idle_threads, smp_processor_id()) = current; } +/** + * idle_init - Initialize the idle thread for a cpu + * @cpu: The cpu for which the idle thread should be initialized + * + * Creates the thread if it does not exist. + */ static inline void idle_init(unsigned int cpu) { struct task_struct *tsk = per_cpu(idle_threads, cpu); @@ -45,17 +51,16 @@ static inline void idle_init(unsigned int cpu) } /** - * idle_thread_init - Initialize the idle thread for a cpu - * @cpu: The cpu for which the idle thread should be initialized - * - * Creates the thread if it does not exist. + * idle_threads_init - Initialize idle threads for all cpus */ void __init idle_threads_init(void) { - unsigned int cpu; + unsigned int cpu, boot_cpu; + + boot_cpu = smp_processor_id(); for_each_possible_cpu(cpu) { - if (cpu != smp_processor_id()) + if (cpu != boot_cpu) idle_init(cpu); } } diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657..f0ec44dcd41 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1786,27 +1786,13 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE -static bool vma_flags_mismatch(struct vm_area_struct *vma, - unsigned long required, - unsigned long banned) -{ - return (vma->vm_flags & required) != required || - (vma->vm_flags & banned); -} - static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { + struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; - /* - * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's - * remain. So perform a quick test first. - */ - if (mm->num_exe_file_vmas) - return -EBUSY; - exe_file = fget(fd); if (!exe_file) return -EBADF; @@ -1827,17 +1813,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if there are mapped other files. + */ + err = -EBUSY; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file && !path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_unlock; + } + /* * The symlink can be changed only once, just to disallow arbitrary * transitions malicious software might bring in. This means one * could make a snapshot over all processes running and monitor * /proc/pid/exe changes to notice unusual activity if needed. */ - down_write(&mm->mmap_sem); - if (likely(!mm->exe_file)) - set_mm_exe_file(mm, exe_file); - else - err = -EBUSY; + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + set_mm_exe_file(mm, exe_file); +exit_unlock: up_write(&mm->mmap_sem); exit: @@ -1862,7 +1861,7 @@ static int prctl_set_mm(int opt, unsigned long addr, if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsigned int)addr); - if (addr >= TASK_SIZE) + if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL; error = -EINVAL; @@ -1924,12 +1923,6 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EFAULT; goto out; } -#ifdef CONFIG_STACK_GROWSUP - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) -#else - if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) -#endif - goto out; if (opt == PR_SET_MM_START_STACK) mm->start_stack = addr; else if (opt == PR_SET_MM_ARG_START) @@ -1981,12 +1974,22 @@ out: up_read(&mm->mmap_sem); return error; } + +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return put_user(me->clear_child_tid, tid_addr); +} + #else /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { return -EINVAL; } +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} #endif SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -2124,6 +2127,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; default: return -EINVAL; } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c..7e1ce012a85 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff56..da70c6db496 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* @@ -814,6 +815,16 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +static int sched_skew_tick; + +static int __init skew_tick(char *str) +{ + get_option(&str, &sched_skew_tick); + + return 0; +} +early_param("skew_tick", skew_tick); + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -831,6 +842,14 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset the tick to avert xtime_lock contention. */ + if (sched_skew_tick) { + u64 offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= smp_processor_id(); + hrtimer_add_expires_ns(&ts->sched_timer, offset); + } + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6e46cacf596..6f46a00a1e8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -962,6 +962,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } /* Accumulate raw time */ @@ -1077,6 +1078,7 @@ static void update_wall_time(void) timekeeper.xtime.tv_sec++; leap = second_overflow(timekeeper.xtime.tv_sec); timekeeper.xtime.tv_sec += leap; + timekeeper.wall_to_monotonic.tv_sec -= leap; } timekeeping_update(false); |