From 4d9a5d4319e22670ec6d6227e12b54f361c46d0f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 11 Oct 2012 01:47:16 +0200
Subject: rcu: Remove rcu_switch()

It's only there to call rcu_user_hooks_switch(). Let's just call
rcu_user_hooks_switch() directly; we don't need this function in the
middle.

Signed-off-by: Frederic Weisbecker
Cc: Josh Triplett
Cc: Peter Zijlstra
Cc: Richard Weinberger
Signed-off-by: Paul E. McKenney
---
 include/linux/sched.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..432cc5e1bbe 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1844,14 +1844,6 @@ static inline void rcu_copy_process(struct task_struct *p)
 
 #endif
 
-static inline void rcu_switch(struct task_struct *prev,
-			      struct task_struct *next)
-{
-#ifdef CONFIG_RCU_USER_QS
-	rcu_user_hooks_switch(prev, next);
-#endif
-}
-
 static inline void tsk_restore_flags(struct task_struct *task,
 				     unsigned long orig_flags, unsigned long flags)
 {
-- cgit v1.2.3-70-g09d2

From b637a328bd4f43a0e146d1eef0142b650ba0d644 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 19 Sep 2012 16:58:38 -0700
Subject: rcu: Print remote CPU's stacks in stall warnings

The RCU CPU stall warnings rely on trigger_all_cpu_backtrace() to do
an NMI-based dump of the stack traces of all CPUs. Unfortunately, a
number of architectures do not implement trigger_all_cpu_backtrace(),
in which case RCU falls back to just dumping the stack of the running
CPU. This is unhelpful in the case where the running CPU has detected
that some other CPU has stalled.

This commit therefore makes the running CPU dump the stacks of the
tasks running on the stalled CPUs.

Signed-off-by: Paul E. McKenney
Signed-off-by: Paul E. McKenney
---
 include/linux/sched.h |  2 ++
 kernel/rcutree.c      | 25 ++++++++++++++++++++++++-
 kernel/sched/core.c   |  6 ++++++
 3 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..ba69b5adea3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -109,6 +109,8 @@ extern void update_cpu_load_nohz(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
+extern void dump_cpu_task(int cpu);
+
 struct seq_file;
 struct cfs_rq;
 struct task_group;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 74df86bd920..e78538712df 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -873,6 +873,29 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
 	rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
 }
 
+/*
+ * Dump stacks of all tasks running on stalled CPUs. This is a fallback
+ * for architectures that do not implement trigger_all_cpu_backtrace().
+ * The NMI-triggered stack traces are more accurate because they are
+ * printed by the target CPU.
+ */
+static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
+{
+	int cpu;
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		if (rnp->qsmask != 0) {
+			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+				if (rnp->qsmask & (1UL << cpu))
+					dump_cpu_task(rnp->grplo + cpu);
+		}
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	}
+}
+
 static void print_other_cpu_stall(struct rcu_state *rsp)
 {
 	int cpu;
@@ -929,7 +952,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	if (ndetected == 0)
 		printk(KERN_ERR "INFO: Stall ended before state dump start\n");
 	else if (!trigger_all_cpu_backtrace())
-		dump_stack();
+		rcu_dump_cpu_stacks(rsp);
 
 	/* Complain about tasks blocking the grace period. */
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..59d08fb1a9e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8076,3 +8076,9 @@ struct cgroup_subsys cpuacct_subsys = {
 	.base_cftypes = files,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
+
+void dump_cpu_task(int cpu)
+{
+	pr_info("Task dump for CPU %d:\n", cpu);
+	sched_show_task(cpu_curr(cpu));
+}
-- cgit v1.2.3-70-g09d2

From 9d85f21c94f7f7a84d0ba686c58aa6d9da58fdbb Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Thu, 4 Oct 2012 13:18:29 +0200
Subject: sched: Track the runnable average on a per-task entity basis

Instead of tracking the average of the load parented by a cfs_rq, we
can track entity load directly; the load of a given cfs_rq is then
simply the sum of the loads of its children.

To do this we represent the historical contribution to runnable average
within each trailing 1024us of execution as the coefficients of a
geometric series. We can express this for a given task t as:

  runnable_sum(t) = \Sum u_i * y^i
  runnable_avg_period(t) = \Sum 1024 * y^i
  load(t) = weight_t * runnable_sum(t) / runnable_avg_period(t)

where u_i is the usage in the i-th most recent 1024us period
(approximately 1ms) and y is chosen such that y^k = 1/2. We currently
choose k = 32, which roughly corresponds to one scheduling period.

Signed-off-by: Paul Turner
Reviewed-by: Ben Segall
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20120823141506.372695337@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  13 +++++
 kernel/sched/core.c   |   5 ++
 kernel/sched/debug.c  |   4 ++
 kernel/sched/fair.c   | 129 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 151 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..418fc6d8a4d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1095,6 +1095,16 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
+struct sched_avg {
+	/*
+	 * These sums represent an infinite geometric series and so are bound
+	 * above by 1024/(1-y).  Thus we only need a u32 to store them for all
+	 * choices of y < 1-2^(-32)*1024.
+	 */
+	u32 runnable_avg_sum, runnable_avg_period;
+	u64 last_runnable_update;
+};
+
 #ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
 	u64			wait_start;
@@ -1155,6 +1165,9 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
+#ifdef CONFIG_SMP
+	struct sched_avg	avg;
+#endif
 };
 
 struct sched_rt_entity {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..fd9d0859350 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1524,6 +1524,11 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+#ifdef CONFIG_SMP
+	p->se.avg.runnable_avg_period = 0;
+	p->se.avg.runnable_avg_sum = 0;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 6f79596e0ea..61f70979153 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -85,6 +85,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->statistics.wait_count);
 #endif
 	P(se->load.weight);
+#ifdef CONFIG_SMP
+	P(se->avg.runnable_avg_sum);
+	P(se->avg.runnable_avg_period);
+#endif
 #undef PN
 #undef P
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b99..16d67f9b695 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -971,6 +971,126 @@ static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_SMP
+/*
+ * Approximate:
+ *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
+{
+	for (; n && val; n--) {
+		val *= 4008;
+		val >>= 12;
+	}
+
+	return val;
+}
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series.  To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ *       p0            p1           p2
+ *      (now)       (~1ms ago)  (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our coefficients, yielding the
+ * following representation of historical load:
+ *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ *   y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+							struct sched_avg *sa,
+							int runnable)
+{
+	u64 delta;
+	int delta_w, decayed = 0;
+
+	delta = now - sa->last_runnable_update;
+	/*
+	 * This should only happen when time goes backwards, which it
+	 * unfortunately does during sched clock init when we swap over to TSC.
+	 */
+	if ((s64)delta < 0) {
+		sa->last_runnable_update = now;
+		return 0;
+	}
+
+	/*
+	 * Use 1024ns as the unit of measurement since it's a reasonable
+	 * approximation of 1us and fast to compute.
+	 */
+	delta >>= 10;
+	if (!delta)
+		return 0;
+	sa->last_runnable_update = now;
+
+	/* delta_w is the amount already accumulated against our next period */
+	delta_w = sa->runnable_avg_period % 1024;
+	if (delta + delta_w >= 1024) {
+		/* period roll-over */
+		decayed = 1;
+
+		/*
+		 * Now that we know we're crossing a period boundary, figure
+		 * out how much from delta we need to complete the current
+		 * period and accrue it.
+		 */
+		delta_w = 1024 - delta_w;
+		BUG_ON(delta_w > delta);
+		do {
+			if (runnable)
+				sa->runnable_avg_sum += delta_w;
+			sa->runnable_avg_period += delta_w;
+
+			/*
+			 * Remainder of delta initiates a new period, roll over
+			 * the previous.
+			 */
+			sa->runnable_avg_sum =
+				decay_load(sa->runnable_avg_sum, 1);
+			sa->runnable_avg_period =
+				decay_load(sa->runnable_avg_period, 1);
+
+			delta -= delta_w;
+			/* New period is empty */
+			delta_w = 1024;
+		} while (delta >= 1024);
+	}
+
+	/* Remainder of delta accrued against u_0` */
+	if (runnable)
+		sa->runnable_avg_sum += delta;
+	sa->runnable_avg_period += delta;
+
+	return decayed;
+}
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se)
+{
+	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
+				     se->on_rq);
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se) {}
+#endif
+
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHEDSTATS
@@ -1097,6 +1217,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
+	update_entity_load_avg(se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1171,6 +1292,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_entity_load_avg(se);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1340,6 +1462,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		update_stats_wait_start(cfs_rq, prev);
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
+		/* in !on_rq case, update occurred at dequeue */
+		update_entity_load_avg(prev);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -1352,6 +1476,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_curr(cfs_rq);
 
+	/*
+	 * Ensure that runnable average is periodically updated.
+	 */
+	update_entity_load_avg(curr);
+
 	/*
 	 * Update share accounting for long-running entities.
 	 */
-- cgit v1.2.3-70-g09d2

From 2dac754e10a5d41d94d2d2365c0345d4f215a266 Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Thu, 4 Oct 2012 13:18:30 +0200
Subject: sched: Aggregate load contributed by task entities on parenting cfs_rq

For a given task t, we can compute its contribution to load as:

  task_load(t) = runnable_avg(t) * weight(t)

On a parenting cfs_rq we can then aggregate:

  runnable_load(cfs_rq) = \Sum task_load(t), for all runnable children t

Maintain this bottom up, with task entities adding their contributed
load to the parenting cfs_rq sum. When a task entity's load changes, we
add the same delta to the maintained sum.
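
[Editor's note: to make the bookkeeping concrete, here is a minimal
user-space sketch of the invariant this patch maintains. The field
names mirror the patch, but this is a simplified single-runqueue model,
not the kernel implementation.]

	/* Toy model of bottom-up load aggregation on a parenting cfs_rq. */
	struct toy_entity {
		long load_avg_contrib;	/* last contribution added to the sum */
		int on_rq;		/* only runnable children are summed */
	};

	struct toy_cfs_rq {
		long runnable_load_avg;	/* \Sum load_avg_contrib of on_rq children */
	};

	/* When an entity's load changes, propagate only the delta upward. */
	static void toy_update_contrib(struct toy_cfs_rq *cfs_rq,
				       struct toy_entity *se, long new_contrib)
	{
		long delta = new_contrib - se->load_avg_contrib;

		se->load_avg_contrib = new_contrib;
		if (se->on_rq)
			cfs_rq->runnable_load_avg += delta;
	}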
Signed-off-by: Paul Turner
Reviewed-by: Ben Segall
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20120823141506.514678907@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  1 +
 kernel/sched/debug.c  |  3 +++
 kernel/sched/fair.c   | 51 +++++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h  | 10 +++++++++-
 4 files changed, 60 insertions(+), 5 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 418fc6d8a4d..81d8b1ba410 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1103,6 +1103,7 @@ struct sched_avg {
 	 */
 	u32 runnable_avg_sum, runnable_avg_period;
 	u64 last_runnable_update;
+	unsigned long load_avg_contrib;
 };
 
 #ifdef CONFIG_SCHEDSTATS
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4240abce411..c953a89f94a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #ifdef CONFIG_SMP
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
+	P(se->avg.load_avg_contrib);
 #endif
 #undef PN
 #undef P
@@ -224,6 +225,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->load_contribution);
 	SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
 			atomic_read(&cfs_rq->tg->load_weight));
+	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
+			cfs_rq->runnable_load_avg);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c5468fcf10..77af759e567 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1081,20 +1081,63 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 	return decayed;
 }
 
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+	long old_contrib = se->avg.load_avg_contrib;
+
+	if (!entity_is_task(se))
+		return 0;
+
+	se->avg.load_avg_contrib = div64_u64(se->avg.runnable_avg_sum *
+					     se->load.weight,
+					     se->avg.runnable_avg_period + 1);
+
+	return se->avg.load_avg_contrib - old_contrib;
+}
+
 /* Update a sched_entity's runnable average */
 static inline void update_entity_load_avg(struct sched_entity *se)
 {
-	__update_entity_runnable_avg(rq_of(cfs_rq_of(se))->clock_task, &se->avg,
-				     se->on_rq);
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	long contrib_delta;
+
+	if (!__update_entity_runnable_avg(rq_of(cfs_rq)->clock_task, &se->avg,
+					  se->on_rq))
+		return;
+
+	contrib_delta = __update_entity_load_avg_contrib(se);
+	if (se->on_rq)
+		cfs_rq->runnable_load_avg += contrib_delta;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
 }
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se)
+{
+	update_entity_load_avg(se);
+	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+}
+
+/* Remove se's load from this cfs_rq child load-average */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se)
+{
+	update_entity_load_avg(se);
+	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+}
 #else
 static inline void update_entity_load_avg(struct sched_entity *se) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+					   struct sched_entity *se) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1223,7 +1266,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_entity_load_avg(se);
+	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1298,7 +1341,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	update_entity_load_avg(se);
+	dequeue_entity_load_avg(cfs_rq, se);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14b57196871..e6539736af5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -222,6 +222,15 @@ struct cfs_rq {
 	unsigned int nr_spread_over;
 #endif
 
+#ifdef CONFIG_SMP
+	/*
+	 * CFS Load tracking
+	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
+	 * This allows for the description of both thread and group usage (in
+	 * the FAIR_GROUP_SCHED case).
+	 */
+	u64 runnable_load_avg;
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
 
@@ -1214,4 +1223,3 @@ static inline u64 irq_time_read(int cpu)
 }
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-- cgit v1.2.3-70-g09d2

From 9ee474f55664ff63111c843099d365e7ecffb56f Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Thu, 4 Oct 2012 13:18:30 +0200
Subject: sched: Maintain the load contribution of blocked entities

We are currently maintaining:

  runnable_load(cfs_rq) = \Sum task_load(t)

for all running children t of cfs_rq. While this can be naturally
updated for tasks in a runnable state (as they are scheduled), it does
not account for the load contributed by blocked task entities.

This can be solved by introducing a separate accounting for blocked
load:

  blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)

Obviously we do not want to iterate over all blocked entities to
account for their decay; we instead observe that:

  runnable_load(t) = \Sum u_i * y^i

and that to account for an additional idle period we only need to
compute:

  y * runnable_load(t)

This means that we can compute the decay for all blocked entities at
once by evaluating:

  blocked_load(cfs_rq)' = y * blocked_load(cfs_rq)

Finally we maintain a decay counter so that when a sleeping entity
re-awakens we can determine how much of its load should be removed from
the blocked sum.
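
[Editor's note: the batched decay can be sketched in user space as
follows. The decay constant mirrors decay_load() in the patch
(y = 4008/4096, so y^32 ~= 1/2), but the structures are simplified
stand-ins, not the kernel's.]

	typedef unsigned long long u64;

	struct toy_cfs_rq {
		u64 blocked_load_avg;	/* decayed load of blocked children */
		u64 decay_counter;	/* total periods decayed so far */
	};

	static u64 toy_decay(u64 val, u64 n)
	{
		for (; n && val; n--)
			val = (val * 4008) >> 12;	/* val *= y, y^32 = 1/2 */
		return val;
	}

	/* One period elapses: O(1), ages the entire blocked sum by y. */
	static void toy_tick(struct toy_cfs_rq *cfs_rq)
	{
		cfs_rq->blocked_load_avg = toy_decay(cfs_rq->blocked_load_avg, 1);
		cfs_rq->decay_counter++;
	}

	/*
	 * On wakeup, decay the entity by the periods it slept through and
	 * remove what remains of its contribution from the blocked sum.
	 */
	static u64 toy_wake(struct toy_cfs_rq *cfs_rq, u64 contrib, u64 decay_count)
	{
		contrib = toy_decay(contrib, cfs_rq->decay_counter - decay_count);
		if (contrib < cfs_rq->blocked_load_avg)
			cfs_rq->blocked_load_avg -= contrib;
		else
			cfs_rq->blocked_load_avg = 0;
		return contrib;
	}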
Signed-off-by: Paul Turner
Reviewed-by: Ben Segall
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20120823141506.585389902@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |   1 +
 kernel/sched/core.c   |   1 -
 kernel/sched/debug.c  |   3 ++
 kernel/sched/fair.c   | 128 +++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched/sched.h  |   4 +-
 5 files changed, 122 insertions(+), 15 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81d8b1ba410..b1831accfd8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1103,6 +1103,7 @@ struct sched_avg {
 	 */
 	u32 runnable_avg_sum, runnable_avg_period;
 	u64 last_runnable_update;
+	s64 decay_count;
 	unsigned long load_avg_contrib;
 };
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fd9d0859350..00898f1fb69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1528,7 +1528,6 @@ static void __sched_fork(struct task_struct *p)
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
 #endif
-
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c953a89f94a..2d2e2b3c1be 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 	P(se->avg.runnable_avg_sum);
 	P(se->avg.runnable_avg_period);
 	P(se->avg.load_avg_contrib);
+	P(se->avg.decay_count);
 #endif
 #undef PN
 #undef P
@@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic_read(&cfs_rq->tg->load_weight));
 	SEQ_printf(m, "  .%-30s: %lld\n", "runnable_load_avg",
 			cfs_rq->runnable_load_avg);
+	SEQ_printf(m, "  .%-30s: %lld\n", "blocked_load_avg",
+			cfs_rq->blocked_load_avg);
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77af759e567..83194175e84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -259,6 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return grp->my_q;
 }
 
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq);
+
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
@@ -278,6 +280,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 		}
 
 		cfs_rq->on_list = 1;
+		/* We should have no load, but we need to update last_decay. */
+		update_cfs_rq_blocked_load(cfs_rq);
 	}
 }
 
@@ -1081,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 	return decayed;
 }
 
+/* Synchronize an entity's decay with its parenting cfs_rq. */
+static inline void __synchronize_entity_decay(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+	decays -= se->avg.decay_count;
+	if (!decays)
+		return;
+
+	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+	se->avg.decay_count = 0;
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1096,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	return se->avg.load_avg_contrib - old_contrib;
 }
 
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+						 long load_contrib)
+{
+	if (likely(load_contrib < cfs_rq->blocked_load_avg))
+		cfs_rq->blocked_load_avg -= load_contrib;
+	else
+		cfs_rq->blocked_load_avg = 0;
+}
+
 /* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se)
+static inline void update_entity_load_avg(struct sched_entity *se,
+					  int update_cfs_rq)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	long contrib_delta;
@@ -1107,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se)
 		return;
 
 	contrib_delta = __update_entity_load_avg_contrib(se);
+
+	if (!update_cfs_rq)
+		return;
+
 	if (se->on_rq)
 		cfs_rq->runnable_load_avg += contrib_delta;
+	else
+		subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may be appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+{
+	u64 now = rq_of(cfs_rq)->clock_task >> 20;
+	u64 decays;
+
+	decays = now - cfs_rq->last_decay;
+	if (!decays)
+		return;
+
+	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+					      decays);
+	atomic64_add(decays, &cfs_rq->decay_counter);
+
+	cfs_rq->last_decay = now;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1118,26 +1172,53 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 
 /* Add the load generated by se into cfs_rq's child load-average */
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-					   struct sched_entity *se)
+					   struct sched_entity *se,
+					   int wakeup)
 {
-	update_entity_load_avg(se);
+	/* we track migrations using entity decay_count == 0 */
+	if (unlikely(!se->avg.decay_count)) {
+		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+		wakeup = 0;
+	} else {
+		__synchronize_entity_decay(se);
+	}
+
+	if (wakeup)
+		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+
+	update_entity_load_avg(se, 0);
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+	update_cfs_rq_blocked_load(cfs_rq);
 }
 
-/* Remove se's load from this cfs_rq child load-average */
+/*
+ * Remove se's load from this cfs_rq child load-average; if the entity is
+ * transitioning to a blocked state, we track its projected decay using
+ * blocked_load_avg.
+ */
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-					   struct sched_entity *se)
+					   struct sched_entity *se,
+					   int sleep)
 {
-	update_entity_load_avg(se);
+	update_entity_load_avg(se, 1);
+
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+	if (sleep) {
+		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
 #else
-static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_entity_load_avg(struct sched_entity *se,
+					  int update_cfs_rq) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-					   struct sched_entity *se) {}
+					   struct sched_entity *se,
+					   int wakeup) {}
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-					   struct sched_entity *se) {}
+					   struct sched_entity *se,
+					   int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1266,7 +1347,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	enqueue_entity_load_avg(cfs_rq, se);
+	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
 
@@ -1341,7 +1422,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
-	dequeue_entity_load_avg(cfs_rq, se);
+	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
 	update_stats_dequeue(cfs_rq, se);
 	if (flags & DEQUEUE_SLEEP) {
@@ -1512,7 +1593,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
-		update_entity_load_avg(prev);
+		update_entity_load_avg(prev, 1);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -1528,7 +1609,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_entity_load_avg(curr);
+	update_entity_load_avg(curr, 1);
+	update_cfs_rq_blocked_load(cfs_rq);
 
 	/*
 	 * Update share accounting for long-running entities.
@@ -2387,6 +2469,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
+		update_entity_load_avg(se, 1);
 	}
 
 	if (!se) {
@@ -2448,6 +2531,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		update_cfs_load(cfs_rq, 0);
 		update_cfs_shares(cfs_rq);
+		update_entity_load_avg(se, 1);
 	}
 
 	if (!se) {
@@ -3498,6 +3582,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 
 	update_rq_clock(rq);
 	update_cfs_load(cfs_rq, 1);
+	update_cfs_rq_blocked_load(cfs_rq);
 
 	/*
 	 * We need to update shares after updating tg->load_weight in
@@ -5232,6 +5317,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		place_entity(cfs_rq, se, 0);
 		se->vruntime -= cfs_rq->min_vruntime;
 	}
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	/*
+	 * Remove our load from contribution when we leave sched_fair
+	 * and ensure we don't carry in an old decay_count if we
+	 * switch back.
+	 */
+	if (p->se.avg.decay_count) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+		__synchronize_entity_decay(&p->se);
+		subtract_blocked_load_contrib(cfs_rq,
+				p->se.avg.load_avg_contrib);
+	}
+#endif
 }
 
 /*
@@ -5278,6 +5377,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	atomic64_set(&cfs_rq->decay_counter, 1);
+#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6539736af5..664ff39195f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -229,7 +229,9 @@ struct cfs_rq {
 	 * This allows for the description of both thread and group usage (in
 	 * the FAIR_GROUP_SCHED case).
 	 */
-	u64 runnable_load_avg;
+	u64 runnable_load_avg, blocked_load_avg;
+	atomic64_t decay_counter;
+	u64 last_decay;
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
-- cgit v1.2.3-70-g09d2

From 0a74bef8bed18dc6889e9bc37ea1050a50c86c89 Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Thu, 4 Oct 2012 13:18:30 +0200
Subject: sched: Add an rq migration call-back to sched_class

Since we are now doing bottom-up load accumulation, we need explicit
notification when a task has been re-parented so that the old hierarchy
can be updated. This adds:

  migrate_task_rq(struct task_struct *p, int next_cpu)

(The alternative is to do this out of __set_task_cpu, but it was
suggested that this would be a cleaner encapsulation.)

Signed-off-by: Paul Turner
Reviewed-by: Ben Segall
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20120823141506.660023400@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  1 +
 kernel/sched/core.c   |  2 ++
 kernel/sched/fair.c   | 12 ++++++++++++
 3 files changed, 15 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b1831accfd8..e483ccb08ce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1061,6 +1061,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 00898f1fb69..f26860074ef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -952,6 +952,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
+		if (p->sched_class->migrate_task_rq)
+			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 83194175e84..5e602e6ba0c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3047,6 +3047,17 @@ unlock:
 
 	return new_cpu;
 }
+
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu. However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+}
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -5607,6 +5618,7 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
+	.migrate_task_rq	= migrate_task_rq_fair,
 
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
-- cgit v1.2.3-70-g09d2

From f4e26b120b9de84cb627bc7361ba43cfdc51341f Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Thu, 4 Oct 2012 13:18:32 +0200
Subject: sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking

While per-entity load-tracking is generally useful beyond computing
shares distribution (e.g. for runnable-based load-balance, which is in
progress, governors, power-management, etc.), these facilities are not
yet consumers of this data. This may be trivially reverted when the
information is required; until then, avoid paying the overhead for
calculations we will not use.

Signed-off-by: Paul Turner
Reviewed-by: Ben Segall
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20120823141507.422162369@google.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  8 +++++++-
 kernel/sched/core.c   |  7 ++++++-
 kernel/sched/fair.c   | 13 +++++++++++--
 kernel/sched/sched.h  |  9 ++++++++-
 4 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e483ccb08ce..e1581a029e3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1168,7 +1168,13 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
-#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+	/* Per-entity load-tracking */
 	struct sched_avg	avg;
 #endif
 };
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f26860074ef..5dae0d252ff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1526,7 +1526,12 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
-#ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6ecf455fd95..3e6a3531fa9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -882,7 +882,8 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-#ifdef CONFIG_SMP
+/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -3173,6 +3174,12 @@ unlock:
 	return new_cpu;
 }
 
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu. However, the caller only guarantees p->pi_lock is held; no
  * other assumptions, including the state of rq->lock, should be made.
  */
 static void
 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
@@ -3196,6 +3203,7 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 	}
 }
+#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -5773,8 +5781,9 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	.migrate_task_rq	= migrate_task_rq_fair,
-
+#endif
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0a75a430ca7..5eca173b563 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -225,6 +225,12 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * CFS Load tracking
 	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -234,7 +240,8 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
-
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+/* These always depend on CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
-- cgit v1.2.3-70-g09d2

From 582b336ec2c0f0076f5650a029fcc9abd4a906f7 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Tue, 27 Nov 2012 23:28:54 -0200
Subject: sched: add notifier for cross-cpu migrations

Originally from Jeremy Fitzhardinge.

Acked-by: Ingo Molnar
Signed-off-by: Marcelo Tosatti
---
 include/linux/sched.h |  8 ++++++++
 kernel/sched/core.c   | 15 +++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..6eb2ed819e3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -107,6 +107,14 @@ extern unsigned long this_cpu_load(void);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
+/* Notifier for when a task gets migrated to a new CPU */
+struct task_migration_notifier {
+	struct task_struct *task;
+	int from_cpu;
+	int to_cpu;
+};
+extern void register_task_migration_notifier(struct notifier_block *n);
+
 extern unsigned long get_parent_ip(unsigned long addr);
 
 struct seq_file;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda71..c86b8b6e70f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -922,6 +922,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq->skip_clock_update = 1;
 }
 
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -952,8 +959,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
+		struct task_migration_notifier tmn;
+
 		p->se.nr_migrations++;
 		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+
+		tmn.task = p;
+		tmn.from_cpu = task_cpu(p);
+		tmn.to_cpu = new_cpu;
+
+		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);
-- cgit v1.2.3-70-g09d2
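
[Editor's note: a consumer of the migration notifier added above would
look roughly like the following. The callback and registration use the
standard notifier_block API that the patch builds on, but the consumer
itself (my_migration_cb) is hypothetical, sketched for illustration.]

	#include <linux/kernel.h>
	#include <linux/notifier.h>
	#include <linux/sched.h>

	/* Hypothetical consumer: log every cross-cpu migration. */
	static int my_migration_cb(struct notifier_block *nb,
				   unsigned long unused, void *data)
	{
		struct task_migration_notifier *tmn = data;

		pr_debug("pid %d migrating: cpu %d -> cpu %d\n",
			 tmn->task->pid, tmn->from_cpu, tmn->to_cpu);
		return NOTIFY_OK;
	}

	static struct notifier_block my_migration_nb = {
		.notifier_call = my_migration_cb,
	};

	static int __init my_init(void)
	{
		register_task_migration_notifier(&my_migration_nb);
		return 0;
	}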
From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 21 Nov 2012 16:26:44 +0100
Subject: cputime: Rename thread_group_times to thread_group_cputime_adjusted

We have thread_group_cputime() and thread_group_times(). The naming
doesn't provide enough information about the difference between these
two APIs.

To reduce confusion, rename thread_group_times() to
thread_group_cputime_adjusted(). This name better suggests that it's a
version of thread_group_cputime() that does some stabilization on the
raw cputime values: here, scaling on top of CFS runtime stats and
bounding the values from below to keep them monotonic.

Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Steven Rostedt
Cc: Paul Gortmaker
---
 fs/proc/array.c        | 4 ++--
 include/linux/sched.h  | 4 ++--
 kernel/exit.c          | 4 ++--
 kernel/sched/cputime.c | 8 ++++----
 kernel/sys.c           | 6 +++---
 5 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36ca..d3696708fc1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 			min_flt += sig->min_flt;
 			maj_flt += sig->maj_flt;
-			thread_group_times(task, &utime, &stime);
+			thread_group_cputime_adjusted(task, &utime, &stime);
 			gtime += sig->gtime;
 		}
 
@@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		task_times(task, &utime, &stime);
+		task_cputime_adjusted(task, &utime, &stime);
 		gtime = task->gtime;
 	}
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e1581a029e3..e75cab5820a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1751,8 +1751,8 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
-extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
-extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/kernel/exit.c b/kernel/exit.c
index 346616c0092..618f7ee5600 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
 		 *
-		 * We use thread_group_times() to get times for the thread
+		 * We use thread_group_cputime_adjusted() to get times for the thread
 		 * group, which consolidates times for all threads in the
 		 * group including the group leader.
 		 */
-		thread_group_times(p, &tgutime, &tgstime);
+		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e56f138a23c..7dc155371b9 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -445,13 +445,13 @@ void account_idle_ticks(unsigned long ticks)
  * Use precise platform statistics if available:
  */
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	*ut = p->utime;
 	*st = p->stime;
 }
 
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	struct task_cputime cputime;
 
@@ -516,7 +516,7 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
 	return (__force cputime_t) temp;
 }
 
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, utime = p->utime, total = utime + p->stime;
 
@@ -543,7 +543,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 /*
  * Must be called with siglock held.
  */
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	struct signal_struct *sig = p->signal;
 	struct task_cputime cputime;
diff --git a/kernel/sys.c b/kernel/sys.c
index e6e0ece5f6a..265b3769042 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms)
 	cputime_t tgutime, tgstime, cutime, cstime;
 
 	spin_lock_irq(&current->sighand->siglock);
-	thread_group_times(current, &tgutime, &tgstime);
+	thread_group_cputime_adjusted(current, &tgutime, &tgstime);
 	cutime = current->signal->cutime;
 	cstime = current->signal->cstime;
 	spin_unlock_irq(&current->sighand->siglock);
@@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 	utime = stime = 0;
 
 	if (who == RUSAGE_THREAD) {
-		task_times(current, &utime, &stime);
+		task_cputime_adjusted(current, &utime, &stime);
 		accumulate_thread_rusage(p, r);
 		maxrss = p->signal->maxrss;
 		goto out;
@@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
 			break;
 
 	case RUSAGE_SELF:
-		thread_group_times(p, &tgutime, &tgstime);
+		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
 		utime += tgutime;
 		stime += tgstime;
 		r->ru_nvcsw += p->signal->nvcsw;
-- cgit v1.2.3-70-g09d2

From d37f761dbd276790f70dcf73a287fde2c3464482 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Thu, 22 Nov 2012 00:58:35 +0100
Subject: cputime: Consolidate cputime adjustment code

task_cputime_adjusted() and thread_group_cputime_adjusted() essentially
share the same code; they just don't use the same source:

* The first function uses the cputime in the task struct and the
  previous adjusted snapshot that ensures monotonicity.

* The second adds the cputime of all tasks in the group and the
  previous adjusted snapshot of the whole group from the signal
  structure.

Consolidate the common code that does the adjustment. The two functions
then just need to fetch the values from the appropriate source.
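
[Editor's note: the shared adjustment can be sketched in user space as
follows, with plain integers standing in for cputime_t; this is a
simplified model of the logic, not the kernel code.]

	typedef unsigned long long cputime;

	struct toy_prev_cputime { cputime utime, stime; };

	/*
	 * Scale the tick-sampled utime/stime so they sum to the precise
	 * runtime, then clamp against the previous snapshot so reported
	 * values never go backwards between two reads.
	 */
	static void toy_cputime_adjust(cputime utime, cputime stime, cputime rtime,
				       struct toy_prev_cputime *prev,
				       cputime *ut, cputime *st)
	{
		cputime total = utime + stime;

		if (total)
			utime = (utime * rtime) / total;  /* assumes no overflow */
		else
			utime = rtime;

		if (utime > prev->utime)
			prev->utime = utime;
		if (rtime - prev->utime > prev->stime)
			prev->stime = rtime - prev->utime;

		*ut = prev->utime;
		*st = prev->stime;
	}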
Signed-off-by: Frederic Weisbecker
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Steven Rostedt
Cc: Paul Gortmaker
---
 include/linux/sched.h  | 23 +++++++++++++++++++----
 kernel/fork.c          |  2 +-
 kernel/sched/cputime.c | 46 +++++++++++++++++++++++-----------------------
 3 files changed, 43 insertions(+), 28 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e75cab5820a..5dafac36681 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -433,14 +433,29 @@ struct cpu_itimer {
 	u32 incr_error;
 };
 
+/**
+ * struct cputime - snapshot of system and user cputime
+ * @utime: time spent in user mode
+ * @stime: time spent in system mode
+ *
+ * Gathers a generic snapshot of user and system time.
+ */
+struct cputime {
+	cputime_t utime;
+	cputime_t stime;
+};
+
 /**
  * struct task_cputime - collected CPU time counts
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
  *
- * This structure groups together three kinds of CPU time that are
- * tracked for threads and thread groups.  Most things considering
+ * This is an extension of struct cputime that includes the total runtime
+ * spent by the task from the scheduler point of view.
+ *
+ * As a result, this structure groups together three kinds of CPU time
+ * that are tracked for threads and thread groups.  Most things considering
  * CPU time want to group these counts together and treat all three
  * of them in parallel.
  */
@@ -581,7 +596,7 @@ struct signal_struct {
 	cputime_t gtime;
 	cputime_t cgtime;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-	cputime_t prev_utime, prev_stime;
+	struct cputime prev_cputime;
 #endif
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
@@ -1340,7 +1355,7 @@ struct task_struct {
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-	cputime_t prev_utime, prev_stime;
+	struct cputime prev_cputime;
 #endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time; 		/* monotonic time */
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa..0e7cdb90476 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1222,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->utime = p->stime = p->gtime = 0;
 	p->utimescaled = p->stimescaled = 0;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-	p->prev_utime = p->prev_stime = 0;
+	p->prev_cputime.utime = p->prev_cputime.stime = 0;
 #endif
 #if defined(SPLIT_RSS_COUNTING)
 	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 7dc155371b9..220fdc4db77 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -516,14 +516,18 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
 	return (__force cputime_t) temp;
 }
 
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+static void cputime_adjust(struct task_cputime *curr,
+			   struct cputime *prev,
+			   cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, utime = p->utime, total = utime + p->stime;
+	cputime_t rtime, utime, total;
 
+	utime = curr->utime;
+	total = utime + curr->stime;
 	/*
 	 * Use CFS's precise accounting:
 	 */
-	rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
+	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	if (total)
 		utime = scale_utime(utime, rtime, total);
@@ -533,11 +537,22 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	/*
 	 * Compare with previous values, to keep monotonicity:
 	 */
-	p->prev_utime = max(p->prev_utime, utime);
-	p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+	prev->utime = max(prev->utime, utime);
+	prev->stime = max(prev->stime, rtime - prev->utime);
+
+	*ut = prev->utime;
+	*st = prev->stime;
+}
 
-	*ut = p->prev_utime;
-	*st = p->prev_stime;
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+	struct task_cputime cputime = {
+		.utime = p->utime,
+		.stime = p->stime,
+		.sum_exec_runtime = p->se.sum_exec_runtime,
+	};
+
+	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
 /*
@@ -545,24 +560,9 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
  */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
-	struct signal_struct *sig = p->signal;
 	struct task_cputime cputime;
-	cputime_t rtime, utime, total;
 
 	thread_group_cputime(p, &cputime);
-
-	total = cputime.utime + cputime.stime;
-	rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
-	if (total)
-		utime = scale_utime(cputime.utime, rtime, total);
-	else
-		utime = rtime;
-
-	sig->prev_utime = max(sig->prev_utime, utime);
-	sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
-
-	*ut = sig->prev_utime;
-	*st = sig->prev_stime;
+	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 }
 #endif
-- cgit v1.2.3-70-g09d2

From c4144670fd9b34d6eae22c9f83751745898e8243 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Tue, 2 Oct 2012 16:34:38 -0400
Subject: kill daemonize()

Signed-off-by: Al Viro
---
 drivers/staging/gdm72xx/gdm_usb.c |  4 +-
 fs/file.c                         |  6 ---
 fs/fs_struct.c                    | 24 ----------
 include/linux/fdtable.h           |  1 -
 include/linux/fs_struct.h         |  1 -
 include/linux/sched.h             |  1 -
 kernel/exit.c                     | 92 ---------------------------------
 7 files changed, 1 insertion(+), 128 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/drivers/staging/gdm72xx/gdm_usb.c b/drivers/staging/gdm72xx/gdm_usb.c
index 0c9e8958009..39db582ab1a 100644
--- a/drivers/staging/gdm72xx/gdm_usb.c
+++ b/drivers/staging/gdm72xx/gdm_usb.c
@@ -701,8 +701,6 @@ static int k_mode_thread(void *arg)
 	unsigned long flags, flags2, expire;
 	int ret;
 
-	daemonize("k_mode_wimax");
-
 	while (!k_mode_stop) {
 
 		spin_lock_irqsave(&k_lock, flags2);
@@ -764,7 +762,7 @@ static struct usb_driver gdm_usb_driver = {
 static int __init usb_gdm_wimax_init(void)
 {
 #ifdef CONFIG_WIMAX_GDM72XX_K_MODE
-	kthread_run(k_mode_thread, NULL, "WiMax_thread");
+	kthread_run(k_mode_thread, NULL, "k_mode_wimax");
 #endif /* CONFIG_WIMAX_GDM72XX_K_MODE */
 	return usb_register(&gdm_usb_driver);
 }
diff --git a/fs/file.c b/fs/file.c
index 7cb71b99260..7272a1c5831 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -519,12 +519,6 @@ struct files_struct init_files = {
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 };
 
-void daemonize_descriptors(void)
-{
-	atomic_inc(&init_files.count);
-	reset_files_struct(&init_files);
-}
-
 /*
  * allocate a file descriptor, mark it busy.
 */
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 5df4775fea0..fe6ca583bbc 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -164,27 +164,3 @@ struct fs_struct init_fs = {
 	.seq		= SEQCNT_ZERO,
 	.umask		= 0022,
 };
-
-void daemonize_fs_struct(void)
-{
-	struct fs_struct *fs = current->fs;
-
-	if (fs) {
-		int kill;
-
-		task_lock(current);
-
-		spin_lock(&init_fs.lock);
-		init_fs.users++;
-		spin_unlock(&init_fs.lock);
-
-		spin_lock(&fs->lock);
-		current->fs = &init_fs;
-		kill = !--fs->users;
-		spin_unlock(&fs->lock);
-
-		task_unlock(current);
-		if (kill)
-			free_fs_struct(fs);
-	}
-}
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 45052aa814c..fb7dacae052 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -95,7 +95,6 @@ struct task_struct;
 struct files_struct *get_files_struct(struct task_struct *);
 void put_files_struct(struct files_struct *fs);
 void reset_files_struct(struct files_struct *);
-void daemonize_descriptors(void);
 int unshare_files(struct files_struct **);
 struct files_struct *dup_fd(struct files_struct *, int *);
 void do_close_on_exec(struct files_struct *);
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 003dc0fd734..d0ae3a84bcf 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -21,7 +21,6 @@ extern void set_fs_root(struct fs_struct *, struct path *);
 extern void set_fs_pwd(struct fs_struct *, struct path *);
 extern struct fs_struct *copy_fs_struct(struct fs_struct *);
 extern void free_fs_struct(struct fs_struct *);
-extern void daemonize_fs_struct(void);
 extern int unshare_fs_struct(void);
 
 static inline void get_fs_root(struct fs_struct *fs, struct path *root)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0dd42a02df2..a0166481eb2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2283,7 +2283,6 @@ extern void flush_itimer_signals(void);
 
 extern void do_group_exit(int);
 
-extern void daemonize(const char *, ...);
 extern int allow_signal(int);
 extern int disallow_signal(int);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 346616c0092..f9275e2c7c2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -322,43 +322,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
 
-/**
- * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
- *
- * If a kernel thread is launched as a result of a system call, or if
- * it ever exits, it should generally reparent itself to kthreadd so it
- * isn't in the way of other processes and is correctly cleaned up on exit.
- *
- * The various task state such as scheduling policy and priority may have
- * been inherited from a user process, so we reset them to sane values here.
- *
- * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
- */
-static void reparent_to_kthreadd(void)
-{
-	write_lock_irq(&tasklist_lock);
-
-	ptrace_unlink(current);
-	/* Reparent to init */
-	current->real_parent = current->parent = kthreadd_task;
-	list_move_tail(&current->sibling, &current->real_parent->children);
-
-	/* Set the exit signal to SIGCHLD so we signal init on exit */
-	current->exit_signal = SIGCHLD;
-
-	if (task_nice(current) < 0)
-		set_user_nice(current, 0);
-	/* cpus_allowed? */
-	/* rt_priority? */
-	/* signals? */
-	memcpy(current->signal->rlim, init_task.signal->rlim,
-	       sizeof(current->signal->rlim));
-
-	atomic_inc(&init_cred.usage);
-	commit_creds(&init_cred);
-	write_unlock_irq(&tasklist_lock);
-}
-
 void __set_special_pids(struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
@@ -370,13 +333,6 @@ void __set_special_pids(struct pid *pid)
 		change_pid(curr, PIDTYPE_PGID, pid);
 }
 
-static void set_special_pids(struct pid *pid)
-{
-	write_lock_irq(&tasklist_lock);
-	__set_special_pids(pid);
-	write_unlock_irq(&tasklist_lock);
-}
-
 /*
  * Let kernel threads use this to say that they allow a certain signal.
  * Must not be used if kthread was cloned with CLONE_SIGHAND.
@@ -416,54 +372,6 @@ int disallow_signal(int sig)
 
 EXPORT_SYMBOL(disallow_signal);
 
-/*
- *	Put all the gunge required to become a kernel thread without
- *	attached user resources in one place where it belongs.
- */
-
-void daemonize(const char *name, ...)
-{
-	va_list args;
-	sigset_t blocked;
-
-	va_start(args, name);
-	vsnprintf(current->comm, sizeof(current->comm), name, args);
-	va_end(args);
-
-	/*
-	 * If we were started as result of loading a module, close all of the
-	 * user space pages.  We don't need them, and if we didn't close them
-	 * they would be locked into memory.
-	 */
-	exit_mm(current);
-	/*
-	 * We don't want to get frozen, in case system-wide hibernation
-	 * or suspend transition begins right now.
-	 */
-	current->flags |= (PF_NOFREEZE | PF_KTHREAD);
-
-	if (current->nsproxy != &init_nsproxy) {
-		get_nsproxy(&init_nsproxy);
-		switch_task_namespaces(current, &init_nsproxy);
-	}
-	set_special_pids(&init_struct_pid);
-	proc_clear_tty(current);
-
-	/* Block and flush all signals */
-	sigfillset(&blocked);
-	sigprocmask(SIG_BLOCK, &blocked, NULL);
-	flush_signals(current);
-
-	/* Become as one with the init task */
-
-	daemonize_fs_struct();
-	daemonize_descriptors();
-
-	reparent_to_kthreadd();
-}
-
-EXPORT_SYMBOL(daemonize);
-
 #ifdef CONFIG_MM_OWNER
 /*
  * A task is exiting.   If it owned this mm, find a new owner for the mm.
-- cgit v1.2.3-70-g09d2

From da3d4c5fa56236dd924d77ffc4f982356816b93b Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Sat, 20 Oct 2012 21:49:33 -0400
Subject: get rid of pt_regs argument of do_execve()

Signed-off-by: Al Viro
---
 fs/exec.c                | 16 ++++++----------
 include/linux/sched.h    |  2 +-
 include/linux/syscalls.h |  3 +--
 3 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/fs/exec.c b/fs/exec.c
index f86b6cc2d6c..5797ed07efd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1566,12 +1566,11 @@ out_ret:
 
 int do_execve(const char *filename,
 	const char __user *const __user *__argv,
-	const char __user *const __user *__envp,
-	struct pt_regs *regs)
+	const char __user *const __user *__envp)
 {
 	struct user_arg_ptr argv = { .ptr.native = __argv };
 	struct user_arg_ptr envp = { .ptr.native = __envp };
-	return do_execve_common(filename, argv, envp, regs);
+	return do_execve_common(filename, argv, envp, current_pt_regs());
 }
 
 #ifdef CONFIG_COMPAT
@@ -1668,7 +1667,7 @@ SYSCALL_DEFINE3(execve,
 	struct filename *path = getname(filename);
 	int error = PTR_ERR(path);
 	if (!IS_ERR(path)) {
-		error = do_execve(path->name, argv, envp, current_pt_regs());
+		error = do_execve(path->name, argv, envp);
 		putname(path);
 	}
 	return error;
@@ -1694,12 +1693,9 @@ int kernel_execve(const char *filename,
 		  const char *const argv[],
 		  const char *const envp[])
 {
-	struct pt_regs *p = current_pt_regs();
-	int ret;
-
-	ret = do_execve(filename,
+	int ret = do_execve(filename,
 			(const char __user *const __user *)argv,
-			(const char __user *const __user *)envp, p);
+			(const char __user *const __user *)envp);
 	if (ret < 0)
 		return ret;
 
@@ -1707,6 +1703,6 @@ int kernel_execve(const char *filename,
 	 * We were successful.  We won't be returning to our caller, but
 	 * instead to user space by manipulating the kernel stack.
 	 */
-	ret_from_kernel_execve(p);
+	ret_from_kernel_execve(current_pt_regs());
 }
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a0166481eb2..c57249782e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2288,7 +2288,7 @@ extern int disallow_signal(int);
 
 extern int do_execve(const char *,
 		     const char __user * const __user *,
-		     const char __user * const __user *, struct pt_regs *);
+		     const char __user * const __user *);
 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 #ifdef CONFIG_GENERIC_KERNEL_THREAD
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2779009ceaa..526deb333b9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -833,8 +833,7 @@ int kernel_execve(const char *filename, const char *const argv[], const char *co
 #define kernel_execve(filename, argv, envp) \
 	do_execve(filename, \
 		(const char __user *const __user *)argv, \
-		(const char __user *const __user *)envp, \
-		current_pt_regs())
+		(const char __user *const __user *)envp)
 #endif
 
 asmlinkage long sys_execve(const char __user *filename,
-- cgit v1.2.3-70-g09d2

From afa86fc426ff7e7f5477f15da9c405d08d5cf790 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Mon, 22 Oct 2012 22:51:14 -0400
Subject: flagday: don't pass regs to copy_thread()

Signed-off-by: Al Viro
---
 arch/alpha/kernel/process.c         | 2 +-
 arch/arm/kernel/process.c           | 2 +-
 arch/arm64/kernel/process.c         | 3 +--
 arch/avr32/kernel/process.c         | 2 +-
 arch/blackfin/kernel/process.c      | 6 +++---
 arch/c6x/kernel/process.c           | 2 +-
 arch/cris/arch-v10/kernel/process.c | 3 +--
 arch/cris/arch-v32/kernel/process.c | 3 +--
 arch/frv/kernel/process.c           | 2 +-
 arch/h8300/kernel/process.c         | 2 +-
 arch/hexagon/kernel/process.c       | 3 +--
 arch/ia64/kernel/process.c          | 3 ++-
 arch/m32r/kernel/process.c          | 2 +-
 arch/m68k/kernel/process.c          | 3 +--
 arch/microblaze/kernel/process.c    | 3 +--
 arch/mips/kernel/process.c          | 4 ++--
 arch/mn10300/kernel/process.c       | 2 +-
 arch/openrisc/kernel/process.c      | 2 +-
 arch/parisc/kernel/process.c        | 5 ++---
 arch/powerpc/kernel/process.c       | 4 ++--
 arch/s390/kernel/process.c          | 3 +--
 arch/score/kernel/process.c         | 4 ++--
 arch/sh/kernel/process_32.c         | 3 +--
 arch/sh/kernel/process_64.c         | 5 ++---
 arch/sparc/kernel/process_32.c      | 5 ++---
 arch/sparc/kernel/process_64.c      | 4 ++--
 arch/tile/kernel/process.c          | 5 ++---
 arch/um/kernel/process.c            | 3 +--
 arch/unicore32/kernel/process.c     | 2 +-
 arch/x86/kernel/process_32.c        | 3 +--
 arch/x86/kernel/process_64.c        | 3 +--
 arch/xtensa/kernel/process.c        | 3 +--
 include/linux/sched.h               | 2 +-
 kernel/fork.c                       | 2 +-
 34 files changed, 45 insertions(+), 60 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index e9705bcc96f..b5d0d092369 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -241,7 +241,7 @@ release_thread(struct task_struct *dead_task)
 int
 copy_thread(unsigned long clone_flags, unsigned long usp,
 	    unsigned long arg,
-	    struct task_struct *p, struct pt_regs *wontuse)
+	    struct task_struct *p)
 {
 	extern void ret_from_fork(void);
 	extern void ret_from_kernel_thread(void);
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 4ab80bbb6d9..9800338c5d1 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -376,7 +376,7 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 int copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned
long stk_sz, struct task_struct *p, struct pt_regs *unused) + unsigned long stk_sz, struct task_struct *p) { struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 5a1335caf6f..cb0956bc96e 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -234,8 +234,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, - struct pt_regs *unused) + unsigned long stk_sz, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); unsigned long tls = p->thread.tp_value; diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 03d7aa4a4bc..fd78f58ea79 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -299,7 +299,7 @@ asmlinkage void syscall_return(void); int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index e5ae8fcab43..582276efaaa 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -141,14 +141,14 @@ asmlinkage int bfin_clone(unsigned long clone_flags, unsigned long newsp) int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, - struct task_struct *p, struct pt_regs *regs) + struct task_struct *p) { struct pt_regs *childregs; unsigned long *v; childregs = (struct pt_regs *) (task_stack_page(p) + THREAD_SIZE) - 1; v = ((unsigned long *)childregs) - 2; - if (unlikely(!regs)) { + if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); v[0] = usp; v[1] = topstk; @@ -157,7 +157,7 @@ copy_thread(unsigned long clone_flags, __asm__ __volatile__("%0 = syscfg;":"=da"(childregs->syscfg):); p->thread.usp = 0; } else { - *childregs = *regs; + *childregs = *current_pt_regs(); childregs->r0 = 0; p->thread.usp = usp ? 
: rdusp(); v[0] = v[1] = 0; diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index a3f91895e8b..6434df476f7 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -139,7 +139,7 @@ void start_thread(struct pt_regs *regs, unsigned int pc, unsigned long usp) */ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long ustk_size, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct pt_regs *childregs; diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 520547c8b19..b1018750cff 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -94,8 +94,7 @@ asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct switch_stack *swstack = ((struct switch_stack *)childregs) - 1; diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index 331e70252df..2b23ef0e445 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -109,8 +109,7 @@ extern asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct switch_stack *swstack = ((struct switch_stack *) childregs) - 1; diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 0039bf77b19..23916b2a12a 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -144,7 +144,7 @@ inline unsigned long user_stack(const struct pt_regs *regs) */ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct pt_regs *childregs; diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index b0fb4054aee..b609f63f159 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -129,7 +129,7 @@ void flush_thread(void) int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, - struct task_struct * p, struct pt_regs *unused) + struct task_struct * p) { struct pt_regs * childregs; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 36dce17ed25..06ae9ffcabd 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -87,8 +87,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk) * Copy architecture-specific thread state */ int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, - struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct hexagon_switch_stack *ss; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 25543a295ad..31360cbbd5f 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -393,12 +393,13 @@ ia64_load_extra (struct task_struct *task) int copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, - struct task_struct *p, struct pt_regs *regs) + struct task_struct *p) { extern char ia64_ret_from_clone; struct switch_stack 
*child_stack, *stack; unsigned long rbs, child_rbs, rbs_size; struct pt_regs *child_ptregs; + struct pt_regs *regs = current_pt_regs(); int retval = 0; child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index c37e9a9a8f2..765d0f57c78 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -192,7 +192,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) } int copy_thread(unsigned long clone_flags, unsigned long spu, - unsigned long arg, struct task_struct *tsk, struct pt_regs *unused) + unsigned long arg, struct task_struct *tsk) { struct pt_regs *childregs = task_pt_regs(tsk); extern void ret_from_fork(void); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index aa9b1100027..9a3df4df73c 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -154,8 +154,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) } int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct * p, struct pt_regs * unused) + unsigned long arg, struct task_struct *p) { struct fork_frame { struct switch_stack sw; diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index a5fed8db726..40823fd1db0 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -120,8 +120,7 @@ void flush_thread(void) } int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct thread_info *ti = task_thread_info(p); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index d13720ac656..38097652d62 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -114,10 +114,10 @@ void flush_thread(void) } int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - struct pt_regs *childregs; + struct pt_regs *childregs, *regs = current_pt_regs(); unsigned long childksp; p->set_child_tid = p->clear_child_tid = NULL; diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 5e0ef396458..eb09f5a552f 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -206,7 +206,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ int copy_thread(unsigned long clone_flags, unsigned long c_usp, unsigned long ustk_size, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct pt_regs *c_regs; diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 6b853668369..00c233bf0d0 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -142,7 +142,7 @@ extern asmlinkage void ret_from_fork(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct pt_regs *userregs; struct pt_regs *kregs; diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 9753ecf49a0..d13507246c5 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -204,10 +204,9 @@ int dump_task_fpu (struct task_struct *tsk, 
elf_fpregset_t *r) int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { - struct pt_regs * cregs = &(p->thread.regs); + struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); /* We have to use void * instead of a function pointer, because diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a3143756763..81430674e71 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -733,8 +733,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) extern unsigned long dscr_default; /* defined in arch/powerpc/kernel/sysfs.c */ int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, - struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); @@ -759,6 +758,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, ti->flags |= _TIF_RESTOREALL; f = ret_from_kernel_thread; } else { + struct pt_regs *regs = current_pt_regs(); CHECK_FULL_REGS(regs); *childregs = *regs; if (usp) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e37677796a0..536d64579d9 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -117,8 +117,7 @@ void release_thread(struct task_struct *dead_task) } int copy_thread(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct thread_info *ti; struct fake_frame diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index f96379a5aee..79568466b57 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -87,11 +87,11 @@ void flush_thread(void) {} * set up the kernel stack and exception frames for a new process */ int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); + struct pt_regs *regs = current_pt_regs(); p->thread.reg0 = (unsigned long) childregs; if (unlikely(p->flags & PF_KTHREAD)) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 1786d16b6c6..73eb66fc625 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -128,8 +128,7 @@ asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs; diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index d5c86a8a384..e611c85144b 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -371,10 +371,9 @@ asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { - struct pt_regs *childregs; + struct pt_regs *childregs, *regs = current_pt_regs(); #ifdef CONFIG_SH_FPU /* can't happen for a kernel thread */ diff --git 
a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index bf4c6addce7..ecde946ef83 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -319,11 +319,10 @@ extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - struct pt_regs *childregs; + struct pt_regs *childregs, *regs = current_pt_regs(); char *new_stack; #ifndef CONFIG_SMP diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index dff54f46728..58ef19e7e82 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -622,10 +622,10 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, * Child --> %o0 == parents pid, %o1 == 1 */ int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *t = task_thread_info(p); + struct pt_regs *regs = current_pt_regs(); struct sparc_stackf *parent_sf; unsigned long child_stack_sz; char *child_trap_frame; diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 267936b51b5..0e5661e7d00 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -157,10 +157,9 @@ void arch_release_thread_info(struct thread_info *info) static void save_arch_state(struct thread_struct *t); int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { - struct pt_regs *childregs = task_pt_regs(p); + struct pt_regs *childregs = task_pt_regs(p), *regs = current_pt_regs(); unsigned long ksp; unsigned long *callee_regs; diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index c502c804e8b..b462b13c5ba 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -161,8 +161,7 @@ void fork_handler(void) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct * p, - struct pt_regs *regs) + unsigned long arg, struct task_struct * p) { void (*handler)(void); int kthread = current->flags & PF_KTHREAD; diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 79e44e8ae31..62bad9fed03 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -262,7 +262,7 @@ asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, struct pt_regs *unused) + unsigned long stk_sz, struct task_struct *p) { struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 16efa974532..b5a8905785e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -128,8 +128,7 @@ void release_thread(struct task_struct *dead_task) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 
74aac76c6e3..6e68a619496 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -146,8 +146,7 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { int err; struct pt_regs *childregs; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 0036c14739f..1accf28da5f 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -199,8 +199,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, - unsigned long thread_fn_arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long thread_fn_arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); diff --git a/include/linux/sched.h b/include/linux/sched.h index c57249782e4..78a2ae3470d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2271,7 +2271,7 @@ extern void mm_release(struct task_struct *, struct mm_struct *); extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(unsigned long, unsigned long, unsigned long, - struct task_struct *, struct pt_regs *); + struct task_struct *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 27a337549da..d96a562b131 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1320,7 +1320,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3-70-g09d2 From e80d6661c3a5caa0cebec0853c6cb0db090fb506 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 23:10:08 -0400 Subject: flagday: kill pt_regs argument of do_fork() Signed-off-by: Al Viro --- arch/blackfin/kernel/process.c | 2 +- arch/ia64/kernel/entry.S | 14 ++++++-------- arch/m68k/kernel/process.c | 2 +- arch/mips/kernel/linux32.c | 2 +- arch/mips/kernel/syscall.c | 4 ++-- arch/sparc/kernel/process_32.c | 3 +-- arch/sparc/kernel/process_64.c | 3 +-- include/linux/sched.h | 2 +- kernel/fork.c | 13 +++++-------- 9 files changed, 19 insertions(+), 26 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 582276efaaa..3e16ad9b0a9 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -135,7 +135,7 @@ asmlinkage int bfin_clone(unsigned long clone_flags, unsigned long newsp) #endif if (newsp) newsp -= 12; - return do_fork(clone_flags, newsp, regs, 0, NULL, NULL); + return do_fork(clone_flags, newsp, 0, NULL, NULL); } int diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 940a6726362..e25b784a2b7 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -116,13 +116,12 @@ GLOBAL_ENTRY(sys_clone2) mov loc1=r16 // save ar.pfs across do_fork .body mov out1=in1 - mov out3=in2 + mov out2=in2 tbit.nz p6,p0=in0,CLONE_SETTLS_BIT - mov out4=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + mov out3=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID ;; (p6) st8 [r2]=in5 // store TLS in r16 for copy_thread() - mov out5=in4 // child_tidptr: valid only 
w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID - adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out4=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID mov out0=in0 // out0 = clone_flags br.call.sptk.many rp=do_fork .ret1: .restore sp @@ -148,13 +147,12 @@ GLOBAL_ENTRY(sys_clone) mov loc1=r16 // save ar.pfs across do_fork .body mov out1=in1 - mov out3=16 // stacksize (compensates for 16-byte scratch area) + mov out2=16 // stacksize (compensates for 16-byte scratch area) tbit.nz p6,p0=in0,CLONE_SETTLS_BIT - mov out4=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + mov out3=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID ;; (p6) st8 [r2]=in4 // store TLS in r13 (tp) - mov out5=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID - adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out4=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID mov out0=in0 // out0 = clone_flags br.call.sptk.many rp=do_fork .ret2: .restore sp diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 9a3df4df73c..d538694ad20 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -149,7 +149,7 @@ void flush_thread(void) asmlinkage int m68k_clone(struct pt_regs *regs) { /* regs will be equal to current_pt_regs() */ - return do_fork(regs->d1, regs->d2, regs, 0, + return do_fork(regs->d1, regs->d2, 0, (int __user *)regs->d3, (int __user *)regs->d4); } diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 8796dbc7e35..7adab86c632 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -312,7 +312,7 @@ _sys32_clone(nabi_no_regargs struct pt_regs regs) /* Use __dummy4 instead of getting it off the stack, so that syscall() works. 
*/ child_tidptr = (int __user *) __dummy4; - return do_fork(clone_flags, newsp, ®s, 0, + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index c611e2df776..201cb76b4df 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -92,7 +92,7 @@ save_static_function(sys_fork); static int __used noinline _sys_fork(nabi_no_regargs struct pt_regs regs) { - return do_fork(SIGCHLD, regs.regs[29], ®s, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.regs[29], 0, NULL, NULL); } save_static_function(sys_clone); @@ -123,7 +123,7 @@ _sys_clone(nabi_no_regargs struct pt_regs regs) #else child_tidptr = (int __user *) regs.regs[8]; #endif - return do_fork(clone_flags, newsp, ®s, 0, + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index ecde946ef83..be8e862bada 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -286,8 +286,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, parent_tid_ptr = regs->u_regs[UREG_I2]; child_tid_ptr = regs->u_regs[UREG_I4]; - ret = do_fork(clone_flags, stack_start, - regs, stack_size, + ret = do_fork(clone_flags, stack_start, stack_size, (int __user *) parent_tid_ptr, (int __user *) child_tid_ptr); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 58ef19e7e82..cdb80b2adbe 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -601,8 +601,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, child_tid_ptr = (int __user *) regs->u_regs[UREG_I4]; } - ret = do_fork(clone_flags, stack_start, - regs, stack_size, + ret = do_fork(clone_flags, stack_start, stack_size, parent_tid_ptr, child_tid_ptr); /* If we get an error and potentially restart the system diff --git a/include/linux/sched.h b/include/linux/sched.h index 78a2ae3470d..1162258bcaf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2289,7 +2289,7 @@ extern int disallow_signal(int); extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *); -extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); #ifdef CONFIG_GENERIC_KERNEL_THREAD extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/kernel/fork.c b/kernel/fork.c index 0e68b6686ac..54073078343 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,8 +1527,6 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; - struct pt_regs regs; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); @@ -1546,7 +1544,6 @@ struct task_struct * __cpuinit fork_idle(int cpu) */ long do_fork(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) @@ -1576,7 +1573,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. 
*/ - if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { + if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1632,7 +1629,7 @@ long do_fork(unsigned long clone_flags, */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } #endif @@ -1641,7 +1638,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return do_fork(SIGCHLD, 0, current_pt_regs(), 0, NULL, NULL); + return do_fork(SIGCHLD, 0, 0, NULL, NULL); #else /* can not support in nommu mode */ return(-EINVAL); @@ -1652,7 +1649,7 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, current_pt_regs(), + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif @@ -1675,7 +1672,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - return do_fork(clone_flags, newsp, current_pt_regs(), 0, + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif -- cgit v1.2.3-70-g09d2 From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:43 +0200 Subject: mm: numa: Add fault driven placement and migration NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. 
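In outline, the machinery added below is a feedback loop between the scheduler tick and the page-fault path: the tick periodically queues a piece of task_work, that work marks part of the task's address space pte_numa so that the next touch faults, and the fault handler reports which node did the touching. A condensed sketch of the loop, paraphrasing the fair.c and mm hunks that follow (names simplified, locking and rate-limiting omitted):

	/* Scheduler tick: re-arm the scanner once enough runtime has passed. */
	static void tick_numa(struct task_struct *curr)
	{
		u64 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

		if (curr->se.sum_exec_runtime - curr->node_stamp > period)
			task_work_add(curr, &curr->numa_work, true);
	}

	/* Task work: make the PTEs fault on their next access. */
	static void numa_scan(struct mm_struct *mm)
	{
		struct vm_area_struct *vma;

		for (vma = mm->mmap; vma; vma = vma->vm_next)
			if (vma_migratable(vma))
				change_prot_numa(vma, vma->vm_start, vma->vm_end);
	}

	/* Fault path: record the node that touched the page; the placement
	 * policy built on top of this hook arrives in later patches. */
	void task_numa_fault(int node, int pages)
	{
		task_numa_placement(current);
	}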
Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- arch/sh/mm/Kconfig | 1 + arch/x86/Kconfig | 2 + include/linux/mm_types.h | 11 +++++ include/linux/sched.h | 20 ++++++++ kernel/sched/core.c | 13 +++++ kernel/sched/fair.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/features.h | 7 +++ kernel/sched/sched.h | 6 +++ kernel/sysctl.c | 24 ++++++++- mm/huge_memory.c | 5 +- mm/memory.c | 14 +++++- 11 files changed, 224 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index cb8f9920f4d..0f7c852f355 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -111,6 +111,7 @@ config VSYSCALL config NUMA bool "Non Uniform Memory Access (NUMA) Support" depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL + select ARCH_WANT_NUMA_VARIABLE_LOCALITY default n help Some SH systems have many various memories scattered around diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 46c3bff3ced..1137028fc6d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,8 @@ config X86 def_bool y select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 31f8a3af7d9..ed8638c29b3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -397,6 +397,17 @@ struct mm_struct { #endif #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; +#endif +#ifdef CONFIG_NUMA_BALANCING + /* + * numa_next_scan is the next time when the PTEs will be marked + * pte_numa to gather statistics and migrate pages to new nodes + * if necessary + */ + unsigned long numa_next_scan; + + /* numa_scan_seq prevents two threads setting pte_numa */ + int numa_scan_seq; #endif struct uprobes_state uprobes_state; }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2..844af5b12cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1479,6 +1479,14 @@ struct task_struct { short il_next; short pref_node_fork; #endif +#ifdef CONFIG_NUMA_BALANCING + int numa_scan_seq; + int numa_migrate_seq; + unsigned int numa_scan_period; + u64 node_stamp; /* migration stamp */ + struct callback_head numa_work; +#endif /* CONFIG_NUMA_BALANCING */ + struct rcu_head rcu; /* @@ -1553,6 +1561,14 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed.
*/ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int node, int pages); +#else +static inline void task_numa_fault(int node, int pages) +{ +} +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -1990,6 +2006,10 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_settle_count; + #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda71..cad0d092ce3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif + +#ifdef CONFIG_NUMA_BALANCING + if (p->mm && atomic_read(&p->mm->mm_users) == 1) { + p->mm->numa_next_scan = jiffies; + p->mm->numa_scan_seq = 0; + } + + p->node_stamp = 0ULL; + p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; + p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + p->numa_work.next = &p->numa_work; +#endif /* CONFIG_NUMA_BALANCING */ } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b99..6831abb5dbe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include @@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA_BALANCING +/* + * numa task sample period in ms: 5s + */ +unsigned int sysctl_numa_balancing_scan_period_min = 5000; +unsigned int sysctl_numa_balancing_scan_period_max = 5000*16; + +static void task_numa_placement(struct task_struct *p) +{ + int seq = ACCESS_ONCE(p->mm->numa_scan_seq); + + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + + /* FIXME: Scheduling placement policy hints go here */ +} + +/* + * Got a PROT_NONE fault for a page on @node. + */ +void task_numa_fault(int node, int pages) +{ + struct task_struct *p = current; + + /* FIXME: Allocate task-specific structure for placement policy here */ + + task_numa_placement(p); +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + + WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; /* protect against double add */ + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. + */ + if (p->flags & PF_EXITING) + return; + + /* + * Enforce maximal scan/migration frequency.. 
+ */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + + next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + ACCESS_ONCE(mm->numa_scan_seq)++; + { + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + change_prot_numa(vma, vma->vm_start, vma->vm_end); + } + up_read(&mm->mmap_sem); + } +} + +/* + * Drive the periodic memory faults.. + */ +void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. + */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now - curr->node_stamp > period) { + curr->node_stamp = now; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) { + init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ + task_work_add(curr, work, true); + } + } +} +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } + + if (sched_feat_numa(NUMA)) + task_tick_numa(rq, curr); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad702..5fb7aefbec8 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) + +/* + * Apply the automatic NUMA scheduling policy + */ +#ifdef CONFIG_NUMA_BALANCING +SCHED_FEAT(NUMA, true) +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7db09cfab..ae31c051ff2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#ifdef CONFIG_NUMA_BALANCING +#define sched_feat_numa(x) sched_feat(x) +#else +#define sched_feat_numa(x) (0) +#endif + static inline u64 global_rt_period(void) { return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26f65eaa01f..025e1ae50ef 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ 
#ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, +#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d79f7a55bf6..ee8133794a5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, */ split_huge_page(page); put_page(page); + return 0; clear_pmdnuma: @@ -1060,8 +1061,10 @@ clear_pmdnuma: out_unlock: spin_unlock(&mm->page_table_lock); - if (page) + if (page) { put_page(page); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + } return 0; } diff --git a/mm/memory.c b/mm/memory.c index d52542680e1..8012c190789 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *page = NULL; spinlock_t *ptl; - int current_nid, target_nid; + int current_nid = -1; + int target_nid; /* * The "pte" at this point cannot be used safely without @@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, current_nid = target_nid; out: + task_numa_fault(current_nid, 1); return 0; } @@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { pte_t pteval = *pte; struct page *page; + int curr_nid; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = vm_normal_page(vma, addr, pteval); if (unlikely(!page)) continue; + /* only check non-shared pages */ + if (unlikely(page_mapcount(page) != 1)) + continue; + pte_unmap_unlock(pte, ptl); + + curr_nid = page_to_nid(page); + task_numa_fault(curr_nid, 1); + + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } pte_unmap_unlock(orig_pte, ptl); -- cgit v1.2.3-70-g09d2 From 6e5fb223e89dbe5cb5c563f8d4a4a0a7d62455a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:45 +0200 Subject: mm: sched: numa: Implement constant, per task Working Set Sampling (WSS) rate Previously, to probe the working set of a task, we'd use a very simple and crude method: mark all of its address space PROT_NONE. That method has various (obvious) disadvantages: - it samples the working set at dissimilar rates, giving some tasks a sampling quality advantage over others. 
- creates performance problems for tasks with very large working sets - over-samples processes with large address spaces but which only very rarely execute. Improve that method by keeping a rotating offset into the address space that marks the current position of the scan, and advance it at a constant rate, proportional to the CPU cycles the task executes. If the offset reaches the last mapped address of the mm, it starts over at the first address. The per-task nature of the working set sampling functionality in this tree allows such constant rate, per task, execution-weight proportional sampling of the working set, with an adaptive sampling interval/frequency that goes from once per 100ms up to just once per 8 seconds. The current sampling volume is 256 MB per interval. As tasks mature and converge their working set, the sampling rate slows down to just a trickle, 256 MB per 8 seconds of CPU time executed. This, beyond being adaptive, also rate-limits rarely executing systems and does not over-sample on overloaded systems. [ In AutoNUMA speak, this patch deals with the effective sampling rate of the 'hinting page fault'. AutoNUMA's scanning is currently rate-limited, but it is also fundamentally single-threaded, executing in the knuma_scand kernel thread, so the limit in AutoNUMA is global and does not scale up with the number of CPUs, nor does it scan tasks in an execution proportional manner. So the idea of rate-limiting the scanning was first implemented in the AutoNUMA tree via a global rate limit. This patch goes beyond that by implementing an execution rate proportional working set sampling rate that is not implemented via a single global scanning daemon. ] [ Dan Carpenter pointed out a possible NULL pointer dereference in the first version of this patch. ] Based-on-idea-by: Andrea Arcangeli Bug-Found-By: Dan Carpenter Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote changelog and fixed bug.
] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/mm_types.h | 3 +++ include/linux/sched.h | 1 + kernel/sched/fair.c | 65 ++++++++++++++++++++++++++++++++++++++---------- kernel/sysctl.c | 7 ++++++ 4 files changed, 63 insertions(+), 13 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ed8638c29b3..d1e246c5e50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -406,6 +406,9 @@ struct mm_struct { */ unsigned long numa_next_scan; + /* Restart point for scanning and setting pte_numa */ + unsigned long numa_scan_offset; + /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 844af5b12cb..37841958d23 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2008,6 +2008,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_settle_count; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6831abb5dbe..0a349dd1fa6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -780,10 +780,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_NUMA_BALANCING /* - * numa task sample period in ms: 5s + * numa task sample period in ms */ -unsigned int sysctl_numa_balancing_scan_period_min = 5000; -unsigned int sysctl_numa_balancing_scan_period_max = 5000*16; +unsigned int sysctl_numa_balancing_scan_period_min = 100; +unsigned int sysctl_numa_balancing_scan_period_max = 100*16; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; static void task_numa_placement(struct task_struct *p) { @@ -808,6 +811,12 @@ void task_numa_fault(int node, int pages) task_numa_placement(p); } +static void reset_ptenuma_scan(struct task_struct *p) +{ + ACCESS_ONCE(p->mm->numa_scan_seq)++; + p->mm->numa_scan_offset = 0; +} + /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). @@ -817,6 +826,9 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + struct vm_area_struct *vma; + unsigned long offset, end; + long length; WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -846,18 +858,45 @@ void task_numa_work(struct callback_head *work) if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; - ACCESS_ONCE(mm->numa_scan_seq)++; - { - struct vm_area_struct *vma; + offset = mm->numa_scan_offset; + length = sysctl_numa_balancing_scan_size; + length <<= 20; - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma_migratable(vma)) - continue; - change_prot_numa(vma, vma->vm_start, vma->vm_end); - } - up_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); + vma = find_vma(mm, offset); + if (!vma) { + reset_ptenuma_scan(p); + offset = 0; + vma = mm->mmap; + } + for (; vma && length > 0; vma = vma->vm_next) { + if (!vma_migratable(vma)) + continue; + + /* Skip small VMAs. 
They are not likely to be of relevance */ + if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) + continue; + + offset = max(offset, vma->vm_start); + end = min(ALIGN(offset + length, HPAGE_SIZE), vma->vm_end); + length -= end - offset; + + change_prot_numa(vma, offset, end); + + offset = end; } + + /* + * It is possible to reach the end of the VMA list but the last few VMAs are + * not guaranteed to be vma_migratable. If they are not, we would find the + * !migratable VMA on the next scan but not reset the scanner to the start + * so check it now. + */ + if (vma) + mm->numa_scan_offset = offset; + else + reset_ptenuma_scan(p); + up_read(&mm->mmap_sem); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 025e1ae50ef..7d3a2e0475e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { -- cgit v1.2.3-70-g09d2 From 4b96a29ba891dd59734cb7be80a900fe93aa2d9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:47 +0200 Subject: mm: sched: numa: Implement slow start for working set sampling Add a 1 second delay before starting to scan the working set of a task and starting to balance it amongst nodes. [ note that before the constant per task WSS sampling rate patch the initial scan would happen much later still, in effect that patch caused this regression. ] The theory is that short-run tasks benefit very little from NUMA placement: they come and go, and they better stick to the node they were started on. As tasks mature and rebalance to other CPUs and nodes, so does their NUMA placement have to change and so does it start to matter more and more. In practice this change fixes an observable kbuild regression: # [ a perf stat --null --repeat 10 test of ten bzImage builds to /dev/shm ] !NUMA: 45.291088843 seconds time elapsed ( +- 0.40% ) 45.154231752 seconds time elapsed ( +- 0.36% ) +NUMA, no slow start: 46.172308123 seconds time elapsed ( +- 0.30% ) 46.343168745 seconds time elapsed ( +- 0.25% ) +NUMA, 1 sec slow start: 45.224189155 seconds time elapsed ( +- 0.25% ) 45.160866532 seconds time elapsed ( +- 0.17% ) and it also fixes an observable perf bench (hackbench) regression: # perf stat --null --repeat 10 perf bench sched messaging -NUMA: 0.246225691 seconds time elapsed ( +- 1.31% ) +NUMA no slow start: 0.252620063 seconds time elapsed ( +- 1.13% ) +NUMA 1sec delay: 0.248076230 seconds time elapsed ( +- 1.35% ) The implementation is simple and straightforward, most of the patch deals with adding the /proc/sys/kernel/numa_balancing_scan_delay_ms tunable knob. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote the changelog, ran measurements, tuned the default.
] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/sched.h | 1 + kernel/sched/core.c | 2 +- kernel/sched/fair.c | 5 +++++ kernel/sysctl.c | 7 +++++++ 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 37841958d23..7d95a232b5b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2006,6 +2006,7 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cad0d092ce3..fbfc4843063 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1543,7 +1543,7 @@ static void __sched_fork(struct task_struct *p) p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; #endif /* CONFIG_NUMA_BALANCING */ } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6e1f25ed2b..7727b016157 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -788,6 +788,9 @@ unsigned int sysctl_numa_balancing_scan_period_max = 100*16; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + static void task_numa_placement(struct task_struct *p) { int seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -929,6 +932,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; if (now - curr->node_stamp > period) { + if (!curr->node_stamp) + curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; curr->node_stamp = now; if (!time_before(jiffies, curr->mm->numa_next_scan)) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7d3a2e0475e..48a68cc258c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -352,6 +352,13 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_min_ms", .data = &sysctl_numa_balancing_scan_period_min, -- cgit v1.2.3-70-g09d2 From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Nov 2012 01:18:23 +0000 Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. 
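The backoff itself is only a few lines in task_numa_fault(), paraphrased from the fair.c hunk below:

	/* The page was already on the right node: sample this task less often. */
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
					  p->numa_scan_period + jiffies_to_msecs(10));

Left alone, the period would decay to the maximum and stay there, so the backoff is paired with a periodic reset.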
Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. Signed-off-by: Mel Gorman --- include/linux/mm_types.h | 3 +++ include/linux/sched.h | 5 +++-- kernel/sched/core.c | 1 + kernel/sched/fair.c | 29 +++++++++++++++++++++-------- kernel/sysctl.c | 7 +++++++ mm/huge_memory.c | 2 +- mm/memory.c | 12 ++++++++---- 7 files changed, 44 insertions(+), 15 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c5fffa23986..e850a23dd6e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,6 +410,9 @@ struct mm_struct { */ unsigned long numa_next_scan; + /* numa_next_reset is when the PTE scanner period will be reset */ + unsigned long numa_next_reset; + /* Restart point for scanning and setting pte_numa */ unsigned long numa_scan_offset; diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d95a232b5b..0f4ff2bd03f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1562,9 +1562,9 @@ struct task_struct { #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages); +extern void task_numa_fault(int node, int pages, bool migrated); #else -static inline void task_numa_fault(int node, int pages) +static inline void task_numa_fault(int node, int pages, bool migrated) { } #endif @@ -2009,6 +2009,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_period_reset; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_settle_count; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fbfc4843063..9d255bc0e27 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1537,6 +1537,7 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { p->mm->numa_next_scan = jiffies; + p->mm->numa_next_reset = jiffies; p->mm->numa_scan_seq = 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd18087fd36..4b577863933 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,8 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * numa task sample period in ms */ unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*16; +unsigned int sysctl_numa_balancing_scan_period_max = 100*50; +unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = 256; @@ -806,20 +807,19 @@ static void task_numa_placement(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. */ -void task_numa_fault(int node, int pages) +void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; /* FIXME: Allocate task-specific structure for placement policy here */ /* - * Assume that as faults occur that pages are getting properly placed - * and fewer NUMA hints are required. 
Note that this is a big - * assumption, it assumes processes reach a steady steady with no - * further phase changes. + * If pages are properly placed (did not migrate) then scan slower. + * This is reset periodically in case of phase changes */ - p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, - p->numa_scan_period + jiffies_to_msecs(2)); + if (!migrated) + p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, + p->numa_scan_period + jiffies_to_msecs(10)); task_numa_placement(p); } @@ -857,6 +857,19 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * Reset the scan period if enough time has gone by. Objective is that + * scanning will be reduced if pages are properly placed. As tasks + * can enter different phases this needs to be re-examined. Lacking + * proper tracking of reference behaviour, this blunt hammer is used. + */ + migrate = mm->numa_next_reset; + if (time_after(now, migrate)) { + p->numa_scan_period = sysctl_numa_balancing_scan_period_min; + next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); + xchg(&mm->numa_next_reset, next_scan); + } + /* * Enforce maximal scan/migration frequency.. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 48a68cc258c..8906f90d6fa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 79b96064f8f..199b261a257 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1068,7 +1068,7 @@ out_unlock: spin_unlock(&mm->page_table_lock); if (page) { put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR); + task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); } return 0; } diff --git a/mm/memory.c b/mm/memory.c index 84c6d9eab18..39edb11b63d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3468,6 +3468,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int current_nid = -1; int target_nid; + bool migrated = false; /* * The "pte" at this point cannot be used safely without @@ -3509,12 +3510,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Migrate to the requested node */ - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) current_nid = target_nid; out: if (current_nid != -1) - task_numa_fault(current_nid, 1); + task_numa_fault(current_nid, 1, migrated); return 0; } @@ -3554,6 +3556,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; int curr_nid = local_nid; int target_nid; + bool migrated; if (!pte_present(pteval)) continue; if (!pte_numa(pteval)) @@ -3590,9 +3593,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Migrate to the requested node */ pte_unmap_unlock(pte, ptl); - if (migrate_misplaced_page(page, target_nid)) + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) curr_nid = target_nid; - task_numa_fault(curr_nid, 1); + task_numa_fault(curr_nid, 1, migrated); pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); } -- cgit v1.2.3-70-g09d2 From 
1a687c2e9a99335c9e77392f050fe607fa18a652 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 22 Nov 2012 11:16:36 +0000 Subject: mm: sched: numa: Control enabling and disabling of NUMA balancing This patch adds Kconfig options and kernel parameters to allow the enabling and disabling of automatic NUMA balancing. The existence of such a switch was and is very important when debugging problems related to transparent hugepages and we should have the same for automatic NUMA placement. Signed-off-by: Mel Gorman --- Documentation/kernel-parameters.txt | 3 +++ include/linux/sched.h | 4 ++++ init/Kconfig | 8 +++++++ kernel/sched/core.c | 48 +++++++++++++++++++++++++------------ kernel/sched/fair.c | 3 +++ kernel/sched/features.h | 6 +++-- mm/mempolicy.c | 46 +++++++++++++++++++++++++++++++++++ 7 files changed, 101 insertions(+), 17 deletions(-) (limited to 'include/linux/sched.h') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9776f068306..2e8d2625b81 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1996,6 +1996,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nr_uarts= [SERIAL] maximum number of UARTs to be registered. + numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. + Allowed values are enable and disable + numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. diff --git a/include/linux/sched.h b/include/linux/sched.h index 0f4ff2bd03f..b1e619f9ff1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1563,10 +1563,14 @@ struct task_struct { #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int node, int pages, bool migrated); +extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int node, int pages, bool migrated) { } +static inline void set_numabalancing_state(bool enabled) +{ +} #endif /* diff --git a/init/Kconfig b/init/Kconfig index 9f00f004796..18e2a5920a3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -720,6 +720,14 @@ config ARCH_USES_NUMA_PROT_NONE depends on ARCH_WANTS_PROT_NUMA_PROT_NONE depends on NUMA_BALANCING +config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y + depends on NUMA_BALANCING + help + If set, autonumic NUMA balancing will be enabled if running on a NUMA + machine. 
+ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" default y diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d255bc0e27..7a45015274a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -192,23 +192,10 @@ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; #endif /* HAVE_JUMP_LABEL */ -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int sched_feat_set(char *cmp) { - char buf[64]; - char *cmp; - int neg = 0; int i; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); + int neg = 0; if (strncmp(cmp, "NO_", 3) == 0) { neg = 1; @@ -228,6 +215,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } } + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + i = sched_feat_set(cmp); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -1549,6 +1557,16 @@ static void __sched_fork(struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +#ifdef CONFIG_NUMA_BALANCING +void set_numabalancing_state(bool enabled) +{ + if (enabled) + sched_feat_set("NUMA"); + else + sched_feat_set("NO_NUMA"); +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * fork()/clone()-time setup: */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4b577863933..7a02a2082e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -811,6 +811,9 @@ void task_numa_fault(int node, int pages, bool migrated) { struct task_struct *p = current; + if (!sched_feat_numa(NUMA)) + return; + /* FIXME: Allocate task-specific structure for placement policy here */ /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5fb7aefbec8..d2373a3e325 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,8 +63,10 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) /* - * Apply the automatic NUMA scheduling policy + * Apply the automatic NUMA scheduling policy. Enabled automatically + * at runtime if running on a NUMA machine. Can be controlled via + * numa_balancing= */ #ifdef CONFIG_NUMA_BALANCING -SCHED_FEAT(NUMA, true) +SCHED_FEAT(NUMA, false) #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fd20e28fd2a..046308e9b99 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2521,6 +2521,50 @@ void mpol_free_shared_policy(struct shared_policy *p) mutex_unlock(&p->mutex); } +#ifdef CONFIG_NUMA_BALANCING +static bool __initdata numabalancing_override; + +static void __init check_numabalancing_enable(void) +{ + bool numabalancing_default = false; + + if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) + numabalancing_default = true; + + if (nr_node_ids > 1 && !numabalancing_override) { + printk(KERN_INFO "Enabling automatic NUMA balancing. 
" + "Configure with numa_balancing= or sysctl"); + set_numabalancing_state(numabalancing_default); + } +} + +static int __init setup_numabalancing(char *str) +{ + int ret = 0; + if (!str) + goto out; + numabalancing_override = true; + + if (!strcmp(str, "enable")) { + set_numabalancing_state(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_numabalancing_state(false); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING "Unable to parse numa_balancing=\n"); + + return ret; +} +__setup("numa_balancing=", setup_numabalancing); +#else +static inline void __init check_numabalancing_enable(void) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { @@ -2571,6 +2615,8 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + check_numabalancing_enable(); } /* Reset policy of current process to default */ -- cgit v1.2.3-70-g09d2 From a9c58b907dbc6821533dfc295b63caf111ff1f16 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:54 -0800 Subject: mm, oom: change type of oom_score_adj to short The maximum oom_score_adj is 1000 and the minimum oom_score_adj is -1000, so this range can be represented by the signed short type with no functional change. The extra space this frees up in struct signal_struct will be used for per-thread oom kill flags in the next patch. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/lowmemorykiller.c | 16 ++++++++-------- fs/proc/base.c | 10 +++++----- include/linux/oom.h | 4 ++-- include/linux/sched.h | 6 +++--- include/trace/events/oom.h | 4 ++-- include/trace/events/task.h | 8 ++++---- mm/ksm.c | 2 +- mm/oom_kill.c | 10 +++++----- mm/swapfile.c | 2 +- 9 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux/sched.h') diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index b91e4bc332a..3b91b0fd4de 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -40,7 +40,7 @@ #include static uint32_t lowmem_debug_level = 2; -static int lowmem_adj[6] = { +static short lowmem_adj[6] = { 0, 1, 6, @@ -70,9 +70,9 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) int rem = 0; int tasksize; int i; - int min_score_adj = OOM_SCORE_ADJ_MAX + 1; + short min_score_adj = OOM_SCORE_ADJ_MAX + 1; int selected_tasksize = 0; - int selected_oom_score_adj; + short selected_oom_score_adj; int array_size = ARRAY_SIZE(lowmem_adj); int other_free = global_page_state(NR_FREE_PAGES); int other_file = global_page_state(NR_FILE_PAGES) - @@ -90,7 +90,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) } } if (sc->nr_to_scan > 0) - lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %d\n", + lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n", sc->nr_to_scan, sc->gfp_mask, other_free, other_file, min_score_adj); rem = global_page_state(NR_ACTIVE_ANON) + @@ -107,7 +107,7 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc) rcu_read_lock(); for_each_process(tsk) { struct task_struct *p; - int oom_score_adj; + short oom_score_adj; if (tsk->flags & PF_KTHREAD) continue; @@ -141,11 +141,11 @@ static int lowmem_shrink(struct shrinker *s, struct 
shrink_control *sc) selected = p; selected_tasksize = tasksize; selected_oom_score_adj = oom_score_adj; - lowmem_print(2, "select %d (%s), adj %d, size %d, to kill\n", + lowmem_print(2, "select %d (%s), adj %hd, size %d, to kill\n", p->pid, p->comm, oom_score_adj, tasksize); } if (selected) { - lowmem_print(1, "send sigkill to %d (%s), adj %d, size %d\n", + lowmem_print(1, "send sigkill to %d (%s), adj %hd, size %d\n", selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize); lowmem_deathpending_timeout = jiffies + HZ; @@ -176,7 +176,7 @@ static void __exit lowmem_exit(void) } module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR); -module_param_array_named(adj, lowmem_adj, int, &lowmem_adj_size, +module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size, S_IRUGO | S_IWUSR); module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size, S_IRUGO | S_IWUSR); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e28356a959..aa63d25157b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -985,7 +985,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, { struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); char buffer[PROC_NUMBUF]; - int oom_score_adj = OOM_SCORE_ADJ_MIN; + short oom_score_adj = OOM_SCORE_ADJ_MIN; unsigned long flags; size_t len; @@ -996,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, unlock_task_sighand(task, &flags); } put_task_struct(task); - len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } @@ -1043,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj_min && + if ((short)oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; } - task->signal->oom_score_adj = oom_score_adj; + task->signal->oom_score_adj = (short)oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) - task->signal->oom_score_adj_min = oom_score_adj; + task->signal->oom_score_adj_min = (short)oom_score_adj; trace_oom_score_adj_update(task); err_sighand: diff --git a/include/linux/oom.h b/include/linux/oom.h index 4a4188da457..922dab164eb 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -29,8 +29,8 @@ enum oom_scan_t { OOM_SCAN_SELECT, /* always select this thread first */ }; -extern void compare_swap_oom_score_adj(int old_val, int new_val); -extern int test_set_oom_score_adj(int new_val); +extern void compare_swap_oom_score_adj(short old_val, short new_val); +extern short test_set_oom_score_adj(short new_val); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, diff --git a/include/linux/sched.h b/include/linux/sched.h index 0dd42a02df2..ed30456152d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -631,9 +631,9 @@ struct signal_struct { struct rw_semaphore group_rwsem; #endif - int oom_score_adj; /* OOM kill score adjustment */ - int oom_score_adj_min; /* OOM kill score adjustment minimum value. - * Only settable by CAP_SYS_RESOURCE. */ + short oom_score_adj; /* OOM kill score adjustment */ + short oom_score_adj_min; /* OOM kill score adjustment min value. + * Only settable by CAP_SYS_RESOURCE. 
*/ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index dd4ba3b9200..1e974983757 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -14,7 +14,7 @@ TRACE_EVENT(oom_score_adj_update, TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN ) - __field( int, oom_score_adj) + __field( short, oom_score_adj) ), TP_fast_assign( @@ -23,7 +23,7 @@ TRACE_EVENT(oom_score_adj_update, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s oom_score_adj=%d", + TP_printk("pid=%d comm=%s oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->oom_score_adj) ); diff --git a/include/trace/events/task.h b/include/trace/events/task.h index b53add02e92..102a646e199 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -15,7 +15,7 @@ TRACE_EVENT(task_newtask, __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) __field( unsigned long, clone_flags) - __field( int, oom_score_adj) + __field( short, oom_score_adj) ), TP_fast_assign( @@ -25,7 +25,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); @@ -40,7 +40,7 @@ TRACE_EVENT(task_rename, __field( pid_t, pid) __array( char, oldcomm, TASK_COMM_LEN) __array( char, newcomm, TASK_COMM_LEN) - __field( int, oom_score_adj) + __field( short, oom_score_adj) ), TP_fast_assign( @@ -50,7 +50,7 @@ TRACE_EVENT(task_rename, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd", __entry->pid, __entry->oldcomm, __entry->newcomm, __entry->oom_score_adj) ); diff --git a/mm/ksm.c b/mm/ksm.c index 31ae5ea1eac..b4d5a9deb17 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1919,7 +1919,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - int oom_score_adj; + short oom_score_adj; oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); err = unmerge_and_remove_all_rmap_items(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7e9e9113bd0..37ab4c5ab6e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_lock); * @old_val. Usually used to reinstate a previous value to prevent racing with * userspacing tuning the value in the interim. */ -void compare_swap_oom_score_adj(int old_val, int new_val) +void compare_swap_oom_score_adj(short old_val, short new_val) { struct sighand_struct *sighand = current->sighand; @@ -72,7 +72,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) * synchronization and returns the old value. Usually used to temporarily * set a value, save the old value in the caller, and then reinstate it later. 
*/ -int test_set_oom_score_adj(int new_val) +short test_set_oom_score_adj(short new_val) { struct sighand_struct *sighand = current->sighand; int old_val; @@ -193,7 +193,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - adj = p->signal->oom_score_adj; + adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; @@ -399,7 +399,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), task->mm->nr_ptes, @@ -415,7 +415,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%d\n", + "oom_score_adj=%hd\n", current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); diff --git a/mm/swapfile.c b/mm/swapfile.c index 0fbb45283c6..bb6f6a04e92 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,7 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int oom_score_adj; + short oom_score_adj; int i, type, prev; int err; -- cgit v1.2.3-70-g09d2 From e1e12d2f3104be886073ac6c5c4678f30b1b9e51 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:56 -0800 Subject: mm, oom: fix race when specifying a thread as the oom origin test_set_oom_score_adj() and compare_swap_oom_score_adj() are used to specify that current should be killed first if an oom condition occurs in between the two calls. The usage is short oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); ... compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); to store the thread's oom_score_adj, temporarily change it to the maximum score possible, and then restore the old value if it is still the same. This happens to still be racy, however, if the user writes OOM_SCORE_ADJ_MAX to /proc/pid/oom_score_adj in between the two calls. The compare_swap_oom_score_adj() will then incorrectly reset the old value prior to the write of OOM_SCORE_ADJ_MAX. To fix this, introduce a new oom_flags_t member in struct signal_struct that will be used for per-thread oom killer flags. KSM and swapoff can now use a bit in this member to specify that threads should be killed first in oom conditions without playing around with oom_score_adj. This also allows the correct oom_score_adj to always be shown when reading /proc/pid/oom_score. 
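To see why the save/restore pattern races, the following stand-alone C model forces the problematic interleaving sequentially. The names mirror the kernel helpers, but this is an illustration only, not kernel code; in the kernel the middle step is a concurrent user write to /proc/pid/oom_score_adj rather than a plain assignment:

#include <stdio.h>

#define OOM_SCORE_ADJ_MAX 1000

static short oom_score_adj;		/* models task->signal->oom_score_adj */

static short test_set_oom_score_adj(short new_val)
{
	short old_val = oom_score_adj;	/* save the current value */

	oom_score_adj = new_val;
	return old_val;
}

static void compare_swap_oom_score_adj(short old_val, short new_val)
{
	if (oom_score_adj == old_val)	/* looks "unchanged", but is not */
		oom_score_adj = new_val;
}

int main(void)
{
	short saved;

	oom_score_adj = 0;

	/* swapoff-style caller: raise to the maximum, do work, restore */
	saved = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);

	/* interleaved user write of OOM_SCORE_ADJ_MAX via
	 * /proc/pid/oom_score_adj -- the user expects this to stick */
	oom_score_adj = OOM_SCORE_ADJ_MAX;

	/* restore path: the value still reads OOM_SCORE_ADJ_MAX, so the
	 * saved value (0) is written back, silently undoing the user */
	compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, saved);

	printf("final oom_score_adj = %hd, user asked for %d\n",
	       oom_score_adj, OOM_SCORE_ADJ_MAX);
	return 0;
}

The per-signal_struct flag avoids the problem entirely because the origin mark and the user-visible oom_score_adj are separate state.
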
Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 19 +++++++++++++++++-- include/linux/sched.h | 1 + include/linux/types.h | 1 + mm/ksm.c | 7 ++----- mm/oom_kill.c | 49 +++++++------------------------------------------ mm/swapfile.c | 5 ++--- 6 files changed, 30 insertions(+), 52 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index 922dab164eb..da60007075b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -29,8 +29,23 @@ enum oom_scan_t { OOM_SCAN_SELECT, /* always select this thread first */ }; -extern void compare_swap_oom_score_adj(short old_val, short new_val); -extern short test_set_oom_score_adj(short new_val); +/* Thread is the potential origin of an oom condition; kill first on oom */ +#define OOM_FLAG_ORIGIN ((__force oom_flags_t)0x1) + +static inline void set_current_oom_origin(void) +{ + current->signal->oom_flags |= OOM_FLAG_ORIGIN; +} + +static inline void clear_current_oom_origin(void) +{ + current->signal->oom_flags &= ~OOM_FLAG_ORIGIN; +} + +static inline bool oom_task_origin(const struct task_struct *p) +{ + return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); +} extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, diff --git a/include/linux/sched.h b/include/linux/sched.h index ed30456152d..3e387df065f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -631,6 +631,7 @@ struct signal_struct { struct rw_semaphore group_rwsem; #endif + oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ diff --git a/include/linux/types.h b/include/linux/types.h index 1cc0e4b9a04..4d118ba1134 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,6 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; +typedef unsigned __bitwise__ oom_flags_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 phys_addr_t; diff --git a/mm/ksm.c b/mm/ksm.c index b4d5a9deb17..382d930a0bf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1919,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - short oom_score_adj; - - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); + set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, - oom_score_adj); + clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 37ab4c5ab6e..18f1ae2b45d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* - * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj - * @old_val: old oom_score_adj for compare - * @new_val: new oom_score_adj for swap - * - * Sets the oom_score_adj value for current to @new_val iff its present value is - * @old_val. Usually used to reinstate a previous value to prevent racing with - * userspacing tuning the value in the interim. 
- */ -void compare_swap_oom_score_adj(short old_val, short new_val) -{ - struct sighand_struct *sighand = current->sighand; - - spin_lock_irq(&sighand->siglock); - if (current->signal->oom_score_adj == old_val) - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); -} - -/** - * test_set_oom_score_adj() - set current's oom_score_adj and return old value - * @new_val: new oom_score_adj value - * - * Sets the oom_score_adj value for current to @new_val with proper - * synchronization and returns the old value. Usually used to temporarily - * set a value, save the old value in the caller, and then reinstate it later. - */ -short test_set_oom_score_adj(short new_val) -{ - struct sighand_struct *sighand = current->sighand; - int old_val; - - spin_lock_irq(&sighand->siglock); - old_val = current->signal->oom_score_adj; - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); - - return old_val; -} - #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill @@ -310,6 +268,13 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (!task->mm) return OOM_SCAN_CONTINUE; + /* + * If task is allocating a lot of memory and has been marked to be + * killed first if it triggers an oom, then select it. + */ + if (oom_task_origin(task)) + return OOM_SCAN_SELECT; + if (task->flags & PF_EXITING && !force_kill) { /* * If this task is not being ptraced on exit, then wait for it diff --git a/mm/swapfile.c b/mm/swapfile.c index bb6f6a04e92..e97a0e5aea9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - short oom_score_adj; int i, type, prev; int err; @@ -1557,9 +1556,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); + set_current_oom_origin(); err = try_to_unuse(type, false, 0); /* force all pages to be unused */ - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); + clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ -- cgit v1.2.3-70-g09d2 From a5ba911ec3792168530d35e16a8ec3b6fc60bcb5 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 17 Dec 2012 16:03:22 -0800 Subject: pidns: remove unused is_container_init() Since commit 1cdcbec1a337 ("CRED: Neuter sys_capset()") is_container_init() has no callers. Signed-off-by: Gao feng Cc: David Howells Acked-by: Serge Hallyn Cc: James Morris Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 ------ kernel/pid.c | 15 --------------- 2 files changed, 21 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index b089c92c609..9914c662ed7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1778,12 +1778,6 @@ static inline int is_global_init(struct task_struct *tsk) return tsk->pid == 1; } -/* - * is_container_init: - * check whether in the task is init in its own pid namespace. 
- */ -extern int is_container_init(struct task_struct *tsk); - extern struct pid *cad_pid; extern void free_task(struct task_struct *tsk); diff --git a/kernel/pid.c b/kernel/pid.c index fd996c1ed9f..a54a1123c7c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -81,21 +81,6 @@ struct pid_namespace init_pid_ns = { }; EXPORT_SYMBOL_GPL(init_pid_ns); -int is_container_init(struct task_struct *tsk) -{ - int ret = 0; - struct pid *pid; - - rcu_read_lock(); - pid = task_pid(tsk); - if (pid != NULL && pid->numbers[pid->level].nr == 1) - ret = 1; - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(is_container_init); - /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). -- cgit v1.2.3-70-g09d2 From 0e9d92f2d02d8c8320f0502307c688d07bdac2b3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:42 -0800 Subject: memcg: skip memcg kmem allocations in specified code regions Create a mechanism that skips memcg allocations during certain pieces of our core code. It basically works in the same way as preempt_disable()/preempt_enable(): by marking a region under which all allocations will be accounted to the root memcg. We need this to prevent races in early cache creation, when we allocate data using caches that are not necessarily created already. Signed-off-by: Glauber Costa Cc: Christoph Lameter Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9914c662ed7..f712465b05c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1597,6 +1597,7 @@ struct task_struct { unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ } memcg_batch; + unsigned int memcg_kmem_skip_account; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT atomic_t ptrace_bp_refcnt; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index efd26620a60..65302a083d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3025,6 +3025,37 @@ out: kfree(s->memcg_params); } +/* + * During the creation a new cache, we need to disable our accounting mechanism + * altogether. This is true even if we are not creating, but rather just + * enqueing new caches to be created. + * + * This is because that process will trigger allocations; some visible, like + * explicit kmallocs to auxiliary data structures, name strings and internal + * cache structures; some well concealed, like INIT_WORK() that can allocate + * objects during debug. + * + * If any allocation happens during memcg_kmem_get_cache, we will recurse back + * to it. This may not be a bounded recursion: since the first cache creation + * failed to complete (waiting on the allocation), we'll just try to create the + * cache again, failing at the same point. + * + * memcg_kmem_get_cache is prepared to abort after seeing a positive count of + * memcg_kmem_skip_account. So we enclose anything that might allocate memory + * inside the following two functions. 
+ */ +static inline void memcg_stop_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account++; +} + +static inline void memcg_resume_kmem_account(void) +{ + VM_BUG_ON(!current->mm); + current->memcg_kmem_skip_account--; +} + static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) { char *name; @@ -3084,7 +3115,6 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out; new_cachep = kmem_cache_dup(memcg, cachep); - if (new_cachep == NULL) { new_cachep = cachep; goto out; @@ -3125,8 +3155,8 @@ static void memcg_create_cache_work_func(struct work_struct *w) * Enqueue the creation of a per-memcg kmem_cache. * Called with rcu_read_lock. */ -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { struct create_work *cw; @@ -3147,6 +3177,24 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, schedule_work(&cw->work); } +static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, + struct kmem_cache *cachep) +{ + /* + * We need to stop accounting when we kmalloc, because if the + * corresponding kmalloc cache is not yet created, the first allocation + * in __memcg_create_cache_enqueue will recurse. + * + * However, it is better to enclose the whole function. Depending on + * the debugging options enabled, INIT_WORK(), for instance, can + * trigger an allocation. This too, will make us recurse. Because at + * this point we can't allow ourselves back into memcg_kmem_get_cache, + * the safest choice is to do it like this, wrapping the whole function. + */ + memcg_stop_kmem_account(); + __memcg_create_cache_enqueue(memcg, cachep); + memcg_resume_kmem_account(); +} /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. @@ -3169,6 +3217,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, VM_BUG_ON(!cachep->memcg_params); VM_BUG_ON(!cachep->memcg_params->is_root_cache); + if (!current->mm || current->memcg_kmem_skip_account) + return cachep; + rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From ae903caae267154de7cf8576b130ff474630596b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Dec 2012 12:44:11 -0500 Subject: Bury the conditionals from kernel_thread/kernel_execve series All architectures have CONFIG_GENERIC_KERNEL_THREAD CONFIG_GENERIC_KERNEL_EXECVE __ARCH_WANT_SYS_EXECVE None of them have __ARCH_WANT_KERNEL_EXECVE and there are only two callers of kernel_execve() (which is a trivial wrapper for do_execve() now) left. Kill the conditionals and make both callers use do_execve(). 
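The contract being preserved is the one userspace knows from execve(2): on success the call never returns to its caller, which is what the removed __ARCH_WANT_KERNEL_EXECVE fallback emulated by rewriting the kernel stack. A plain POSIX C analogue of that contract, offered as an illustration rather than kernel code (the /bin/echo path is an assumption about the running system):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char *argv[] = { "echo", "execve replaced this process", NULL };
	char *envp[] = { NULL };

	execve("/bin/echo", argv, envp);

	/* reached only on failure -- mirrors the deleted fallback's
	 * "if (ret < 0) return ret;" before the stack rewrite */
	fprintf(stderr, "execve failed: %s\n", strerror(errno));
	return 1;
}
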
Signed-off-by: Al Viro --- arch/Kconfig | 6 ------ arch/alpha/Kconfig | 2 -- arch/alpha/include/asm/unistd.h | 1 - arch/arm/Kconfig | 2 -- arch/arm/include/asm/unistd.h | 1 - arch/arm64/Kconfig | 2 -- arch/arm64/include/asm/unistd.h | 1 - arch/avr32/Kconfig | 2 -- arch/avr32/include/asm/unistd.h | 1 - arch/blackfin/Kconfig | 2 -- arch/blackfin/include/asm/unistd.h | 1 - arch/c6x/Kconfig | 2 -- arch/c6x/include/uapi/asm/unistd.h | 1 - arch/cris/Kconfig | 2 -- arch/cris/include/asm/unistd.h | 1 - arch/frv/Kconfig | 2 -- arch/frv/include/asm/unistd.h | 1 - arch/h8300/Kconfig | 2 -- arch/h8300/include/asm/unistd.h | 1 - arch/hexagon/Kconfig | 2 -- arch/hexagon/include/uapi/asm/unistd.h | 1 - arch/ia64/Kconfig | 2 -- arch/ia64/include/asm/unistd.h | 1 - arch/m32r/Kconfig | 2 -- arch/m32r/include/asm/unistd.h | 1 - arch/m68k/Kconfig | 2 -- arch/m68k/include/asm/unistd.h | 1 - arch/microblaze/Kconfig | 2 -- arch/microblaze/include/asm/unistd.h | 1 - arch/mips/Kconfig | 2 -- arch/mips/include/asm/unistd.h | 1 - arch/mn10300/Kconfig | 2 -- arch/mn10300/include/asm/unistd.h | 1 - arch/openrisc/Kconfig | 2 -- arch/openrisc/include/uapi/asm/unistd.h | 1 - arch/parisc/Kconfig | 2 -- arch/parisc/include/asm/unistd.h | 1 - arch/powerpc/Kconfig | 2 -- arch/powerpc/include/asm/unistd.h | 1 - arch/s390/Kconfig | 2 -- arch/s390/include/asm/unistd.h | 1 - arch/score/Kconfig | 2 -- arch/score/include/asm/unistd.h | 1 - arch/sh/Kconfig | 2 -- arch/sh/include/asm/unistd.h | 1 - arch/sparc/Kconfig | 2 -- arch/sparc/include/asm/unistd.h | 1 - arch/tile/Kconfig | 2 -- arch/tile/include/asm/unistd.h | 1 - arch/unicore32/Kconfig | 2 -- arch/unicore32/include/uapi/asm/unistd.h | 1 - arch/x86/Kconfig | 2 -- arch/x86/include/asm/unistd.h | 1 - arch/x86/um/Kconfig | 2 -- arch/xtensa/Kconfig | 2 -- arch/xtensa/include/asm/unistd.h | 1 - fs/exec.c | 21 --------------------- include/linux/binfmts.h | 4 ---- include/linux/sched.h | 2 -- include/linux/syscalls.h | 9 --------- init/main.c | 4 +++- kernel/fork.c | 2 -- kernel/kmod.c | 6 +++--- 63 files changed, 6 insertions(+), 131 deletions(-) (limited to 'include/linux/sched.h') diff --git a/arch/Kconfig b/arch/Kconfig index 8d698fb5ccc..0a8dd0585d0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -271,12 +271,6 @@ config ARCH_WANT_OLD_COMPAT_IPC select ARCH_WANT_COMPAT_IPC_PARSE_VERSION bool -config GENERIC_KERNEL_THREAD - bool - -config GENERIC_KERNEL_EXECVE - bool - config HAVE_ARCH_SECCOMP_FILTER bool help diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 5dd7f5db24d..7e3710c0cce 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -20,8 +20,6 @@ config ALPHA select GENERIC_CMOS_UPDATE select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA help diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index eb3a4664ced..68c75f2fadb 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -481,7 +481,6 @@ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8918a2dd89b..b789654e7e2 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -11,8 +11,6 @@ config ARM select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW - 
select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select GENERIC_PCI_IOMAP select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 7cd13cc6262..21a2700d295 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -41,7 +41,6 @@ #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_SOCKETCALL #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4b03c56ec32..a846029bebc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -7,8 +7,6 @@ config ARM64 select GENERIC_IOMAP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW - select GENERIC_KERNEL_EXECVE - select GENERIC_KERNEL_THREAD select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index d69aeea6da1..738322945d1 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -27,6 +27,5 @@ #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 649aeb9acec..06e73bf665e 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -17,8 +17,6 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/avr32/include/asm/unistd.h b/arch/avr32/include/asm/unistd.h index f05a9804e8e..0bdf6371574 100644 --- a/arch/avr32/include/asm/unistd.h +++ b/arch/avr32/include/asm/unistd.h @@ -39,7 +39,6 @@ #define __ARCH_WANT_SYS_GETPGRP #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index ab9ff4075f4..b6f3ad5441c 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -45,8 +45,6 @@ config BLACKFIN select ARCH_USES_GETTIMEOFFSET if !GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config GENERIC_CSUM def_bool y diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index 460514a1a4e..38711e28baa 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -446,7 +446,6 @@ #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_VFORK /* diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 66eab3703c7..f6a3648f5ec 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -17,8 +17,6 @@ config C6X select OF select OF_EARLY_FLATTREE select GENERIC_CLOCKEVENTS - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_RELA config MMU diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h index f3987a8703d..e7d09a614d1 100644 --- a/arch/c6x/include/uapi/asm/unistd.h +++ b/arch/c6x/include/uapi/asm/unistd.h @@ -14,7 +14,6 @@ * more details. 
*/ -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE /* Use the standard ABI for syscalls. */ diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 0cac6a49f23..c59a01dd9c0 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -49,8 +49,6 @@ config CRIS select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32 select GENERIC_CMOS_UPDATE select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS2 config HZ diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index f27b542e0eb..5cda75a9cc1 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -371,7 +371,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index df2eb4bd9fa..9d262645f66 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -12,8 +12,6 @@ config FRV select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES select ARCH_WANT_IPC_PARSE_VERSION - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config ZONE_DMA bool diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 1807d8ea8cb..d685da17f5f 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 04bef4d25b4..98fabd10e95 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -8,8 +8,6 @@ config H8300 select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SYMBOL_PREFIX string diff --git a/arch/h8300/include/asm/unistd.h b/arch/h8300/include/asm/unistd.h index c2c2f5c7d6b..566f94860c4 100644 --- a/arch/h8300/include/asm/unistd.h +++ b/arch/h8300/include/asm/unistd.h @@ -356,7 +356,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index e418803b6c8..0744f7d7b1f 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,8 +31,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE ---help--- Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. 
diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h index 2af81533bd0..4a87cc47075 100644 --- a/arch/hexagon/include/uapi/asm/unistd.h +++ b/arch/hexagon/include/uapi/asm/unistd.h @@ -27,7 +27,6 @@ */ #define sys_mmap2 sys_mmap_pgoff -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 67060046812..3279646120e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -42,8 +42,6 @@ config IA64 select GENERIC_TIME_VSYSCALL_OLD select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index 1574bca8613..8b3ff2f5b86 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 5183f43a2cf..f807721e19a 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -15,8 +15,6 @@ config M32R select GENERIC_ATOMIC64 select ARCH_USES_GETTIMEOFFSET select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SBUS bool diff --git a/arch/m32r/include/asm/unistd.h b/arch/m32r/include/asm/unistd.h index d9e7351af2a..cbfa39158fe 100644 --- a/arch/m32r/include/asm/unistd.h +++ b/arch/m32r/include/asm/unistd.h @@ -352,7 +352,6 @@ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 953a7ba5d05..6710084e072 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -15,8 +15,6 @@ config M68K select FPU if MMU select ARCH_WANT_IPC_PARSE_VERSION select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL select MODULES_USE_ELF_RELA diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index a021d67cdd7..847994ce680 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -31,7 +31,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 4bcf89148f3..ba3b7c8c04b 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -26,8 +26,6 @@ config MICROBLAZE select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS config SWAP diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index 94d978986b7..38cabf4db54 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -422,7 +422,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_VFORK #ifdef CONFIG_MMU diff --git 
a/arch/mips/Kconfig b/arch/mips/Kconfig index 4183e62f178..dba9390d37c 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -40,8 +40,6 @@ config MIPS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL select MODULES_USE_ELF_RELA if 64BIT - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE menu "Machine selection" diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index b306e2081ca..9e47cc11aa2 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -20,7 +20,6 @@ #define __ARCH_OMIT_COMPAT_SYS_GETDENTS64 #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 72471744a91..aa03f2e1338 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -8,8 +8,6 @@ config MN10300 select HAVE_ARCH_KGDB select HAVE_NMI_WATCHDOG if MN10300_WD_TIMER select GENERIC_CLOCKEVENTS - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_RELA config AM33_2 diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index cabf8ba73b2..e6d2ed4ba68 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -43,7 +43,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index e7f1a2993f7..05f2ba41ff1 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -22,8 +22,6 @@ config OPENRISC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config MMU def_bool y diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h index 5082b806632..ce40b71df00 100644 --- a/arch/openrisc/include/uapi/asm/unistd.h +++ b/arch/openrisc/include/uapi/asm/unistd.h @@ -20,7 +20,6 @@ #define sys_mmap2 sys_mmap_pgoff -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index e688a2be30f..b77feffbade 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -22,8 +22,6 @@ config PARISC select GENERIC_STRNCPY_FROM_USER select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS help diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 1efef41659c..3043194547c 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -163,7 +163,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 951a517a1a0..17903f1f356 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,10 +141,8 @@ config PPC select GENERIC_CLOCKEVENTS select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_KERNEL_THREAD select HAVE_MOD_ARCH_SPECIFIC select 
MODULES_USE_ELF_RELA - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS config EARLY_PRINTK diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 76fe846ec40..784872f9371 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -55,7 +55,6 @@ #define __ARCH_WANT_SYS_NEWFSTATAT #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3cbb8757704..5029ebf7110 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -137,8 +137,6 @@ config S390 select GENERIC_CLOCKEVENTS select KTIME_SCALAR if 32BIT select HAVE_ARCH_SECCOMP_FILTER - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select CLONE_BACKWARDS2 diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 086bb8eaf6a..63653087251 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -53,7 +53,6 @@ # define __ARCH_WANT_COMPAT_SYS_TIME # define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND # endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 45893390c7d..3b1482e7afa 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -13,8 +13,6 @@ config SCORE select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS choice diff --git a/arch/score/include/asm/unistd.h b/arch/score/include/asm/unistd.h index 56001c93095..9cb4260a5f3 100644 --- a/arch/score/include/asm/unistd.h +++ b/arch/score/include/asm/unistd.h @@ -4,7 +4,6 @@ #define __ARCH_WANT_SYSCALL_NO_FLAGS #define __ARCH_WANT_SYSCALL_OFF_T #define __ARCH_WANT_SYSCALL_DEPRECATED -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 8451317eed5..babc2b826c5 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -40,8 +40,6 @@ config SUPERH select GENERIC_STRNLEN_USER select HAVE_MOD_ARCH_SPECIFIC if DWARF_UNWINDER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index 43d3f26b2ea..012004ed333 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -28,7 +28,6 @@ # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK # define __ARCH_WANT_SYS_RT_SIGACTION -# define __ARCH_WANT_SYS_EXECVE # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 0c7d365fa40..9f2edb5c555 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -41,8 +41,6 @@ config SPARC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index c3e5d8b6417..0ecea6ed943 100644 --- a/arch/sparc/include/asm/unistd.h +++ 
b/arch/sparc/include/asm/unistd.h @@ -46,7 +46,6 @@ #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif -#define __ARCH_WANT_SYS_EXECVE /* * "Conditional" syscalls diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index ea7f61e8bc9..875d008828b 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -21,8 +21,6 @@ config TILE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT diff --git a/arch/tile/include/asm/unistd.h b/arch/tile/include/asm/unistd.h index b51c6ee3cd6..940831fe9e9 100644 --- a/arch/tile/include/asm/unistd.h +++ b/arch/tile/include/asm/unistd.h @@ -16,6 +16,5 @@ #define __ARCH_WANT_SYS_LLSEEK #endif #define __ARCH_WANT_SYS_NEWFSTATAT -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index c4fbb21e802..60651df5f95 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -16,8 +16,6 @@ config UNICORE32 select ARCH_WANT_FRAME_POINTERS select GENERIC_IOMAP select MODULES_USE_ELF_REL - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h index 00cf5e286fc..d4cc4559d84 100644 --- a/arch/unicore32/include/uapi/asm/unistd.h +++ b/arch/unicore32/include/uapi/asm/unistd.h @@ -12,5 +12,4 @@ /* Use the standard ABI for syscalls. */ #include -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0df6e7d8453..01ca0ebaff0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,8 +108,6 @@ config X86 select GENERIC_STRNLEN_USER select HAVE_RCU_USER_QS if X86_64 select HAVE_IRQ_TIME_ACCOUNTING - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 0e7dea7d366..9dcfcc1d6f9 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -50,7 +50,6 @@ # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME # define __ARCH_WANT_SYS_WAITPID -# define __ARCH_WANT_SYS_EXECVE # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 8f51c39750d..0fd20f241e4 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -13,8 +13,6 @@ endmenu config UML_X86 def_bool y select GENERIC_FIND_FIRST_BIT - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config 64BIT bool "64-bit kernel" if SUBARCH = "x86" diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 2481f267be2..03a8c107e07 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -13,8 +13,6 @@ config XTENSA select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA select GENERIC_PCI_IOMAP - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select ARCH_WANT_OPTIONAL_GPIOLIB select CLONE_BACKWARDS help diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h index e002dbcc88b..eb63ea87815 100644 --- a/arch/xtensa/include/asm/unistd.h +++ b/arch/xtensa/include/asm/unistd.h @@ -1,7 +1,6 @@ #ifndef 
 #define _XTENSA_UNISTD_H
 
-#define __ARCH_WANT_SYS_EXECVE
 #define __ARCH_WANT_SYS_CLONE
 #include <uapi/asm/unistd.h>
diff --git a/fs/exec.c b/fs/exec.c
index 721a2992951..090ac91da2e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1657,7 +1657,6 @@ int get_dumpable(struct mm_struct *mm)
 	return __get_dumpable(mm->flags);
 }
 
-#ifdef __ARCH_WANT_SYS_EXECVE
 SYSCALL_DEFINE3(execve,
 		const char __user *, filename,
 		const char __user *const __user *, argv,
@@ -1685,23 +1684,3 @@ asmlinkage long compat_sys_execve(const char __user * filename,
 	return error;
 }
 #endif
-#endif
-
-#ifdef __ARCH_WANT_KERNEL_EXECVE
-int kernel_execve(const char *filename,
-		  const char *const argv[],
-		  const char *const envp[])
-{
-	int ret = do_execve(filename,
-			(const char __user *const __user *)argv,
-			(const char __user *const __user *)envp);
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * We were successful.  We won't be returning to our caller, but
-	 * instead to user space by manipulating the kernel stack.
-	 */
-	ret_from_kernel_execve(current_pt_regs());
-}
-#endif
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 2630c9b41a8..8c1388c6ae2 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -121,8 +121,4 @@ extern void install_exec_creds(struct linux_binprm *bprm);
 extern void set_binfmt(struct linux_binfmt *new);
 extern void free_bprm(struct linux_binprm *);
 
-#ifdef __ARCH_WANT_KERNEL_EXECVE
-extern void ret_from_kernel_execve(struct pt_regs *normal) __noreturn;
-#endif
-
 #endif /* _LINUX_BINFMTS_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1162258bcaf..9e5a54e3d84 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2291,9 +2291,7 @@ extern int do_execve(const char *,
 		     const char __user * const __user *);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
-#ifdef CONFIG_GENERIC_KERNEL_THREAD
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-#endif
 
 extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 91835e7f364..9fe5f946526 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -827,15 +827,6 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
 				  const char  __user *pathname);
 asmlinkage long sys_syncfs(int fd);
 
-#ifndef CONFIG_GENERIC_KERNEL_EXECVE
-int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);
-#else
-#define kernel_execve(filename, argv, envp) \
-	do_execve(filename, \
-		(const char __user *const __user *)argv, \
-		(const char __user *const __user *)envp)
-#endif
-
 asmlinkage long sys_fork(void);
 asmlinkage long sys_vfork(void);
 #ifdef CONFIG_CLONE_BACKWARDS
diff --git a/init/main.c b/init/main.c
index e33e09df3cb..155ac208d58 100644
--- a/init/main.c
+++ b/init/main.c
@@ -797,7 +797,9 @@ static void __init do_pre_smp_initcalls(void)
 static int run_init_process(const char *init_filename)
 {
 	argv_init[0] = init_filename;
-	return kernel_execve(init_filename, argv_init, envp_init);
+	return do_execve(init_filename,
+		(const char __user *const __user *)argv_init,
+		(const char __user *const __user *)envp_init);
 }
 
 static void __init kernel_init_freeable(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 54073078343..389712ffc0a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1623,7 +1623,6 @@ long do_fork(unsigned long clone_flags,
 	return nr;
 }
 
-#ifdef CONFIG_GENERIC_KERNEL_THREAD
 /*
  * Create a kernel thread.
  */
@@ -1632,7 +1631,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 	return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
 		(unsigned long)arg, NULL, NULL);
 }
-#endif
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1c317e38683..0023a87e8de 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -219,9 +219,9 @@ static int ____call_usermodehelper(void *data)
 
 	commit_creds(new);
 
-	retval = kernel_execve(sub_info->path,
-			       (const char *const *)sub_info->argv,
-			       (const char *const *)sub_info->envp);
+	retval = do_execve(sub_info->path,
+			   (const char __user *const __user *)sub_info->argv,
+			   (const char __user *const __user *)sub_info->envp);
 
 	if (!retval)
 		return 0;
-- cgit v1.2.3-70-g09d2
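The conversion this patch performs at the surviving kernel_execve() call sites is mechanical: with the generic kernel_execve() helper gone, callers invoke do_execve() directly and force-cast their kernel-side argv/envp arrays to the __user pointer types that do_execve() expects. A minimal sketch of that pattern follows; it is illustrative only, and the helper name run_helper is hypothetical, not from the patch:

static int run_helper(const char *path,
		      const char *const argv[],
		      const char *const envp[])
{
	/*
	 * do_execve() takes __user pointers; the casts below mirror the
	 * ones this patch adds in init/main.c and kernel/kmod.c, where
	 * the argument arrays actually live in kernel space.
	 */
	return do_execve(path,
			 (const char __user *const __user *)argv,
			 (const char __user *const __user *)envp);
}

This is the same cast-and-call sequence the old CONFIG_GENERIC_KERNEL_EXECVE macro in include/linux/syscalls.h expanded to, so dropping the macro loses nothing.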