From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:52 -0500 Subject: sched: add sched_class->needs_post_schedule() member We currently run class->post_schedule() outside of the rq->lock, which means that we need to test for the need to post_schedule outside of the lock to avoid a forced reacquistion. This is currently not a problem as we only look at rq->rt.overloaded. However, we want to enhance this going forward to look at more state to reduce the need to post_schedule to a bare minimum set. Therefore, we introduce a new member-func called needs_post_schedule() which tests for the post_schedule condtion without actually performing the work. Therefore it is safe to call this function before the rq->lock is released, because we are guaranteed not to drop the lock at an intermediate point (such as what post_schedule() may do). We will use this later in the series [ rostedt: removed paranoid BUG_ON ] Signed-off-by: Gregory Haskins --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index e5f928a079e..836a86c32a6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1012,6 +1012,7 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); -- cgit v1.2.3-70-g09d2 From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 29 Dec 2008 09:39:53 -0500 Subject: sched: create "pushable_tasks" list to limit pushing to one attempt The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined if there are any lower priority cpus which should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which will wait for a periodic rebalancing operation to occur before spreading out load. When a particular RQ has more than 1 active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters the active balancing mode, where it will try to push the task away, or persuade a different cpu to pull it over. The system will stay in this state until the system falls back below the <= 1 queued RT task per RQ. However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if it was previously unpushable (due to affinity, etc). Whats more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems: 1) We can have the same tasks analyzed over and over again during each push, which extends out the fast path in the scheduler for no gain. Consider a RQ that has dozens of tasks that are bound to a core. Each one of those tasks will be encountered and skipped for each push operation while they are queued. 2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared, or a pull operation succeeds. The net result is a potential latency source for mid priority tasks. This patch aims to rectify these two conditions by introducing a new priority sorted list: "pushable_tasks". A task is added to the list each time a task is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push. This works because a task only needs to be attempted to push once. After an initial failure to push, the other cpus will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable tasks. Now every task will have a push attempted (when appropriate). This reduces latency both by shorting the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system. [ rostedt: added a couple more BUG_ONs ] Signed-off-by: Gregory Haskins Acked-by: Steven Rostedt --- include/linux/init_task.h | 1 + include/linux/sched.h | 1 + kernel/sched.c | 4 ++ kernel/sched_rt.c | 119 +++++++++++++++++++++++++++++++++++++++------- 4 files changed, 107 insertions(+), 18 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e..6851225f44a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -140,6 +140,7 @@ extern struct group_info init_groups; .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \ .real_parent = &tsk, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 836a86c32a6..440cabb2d43 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1179,6 +1179,7 @@ struct task_struct { #endif struct list_head tasks; + struct plist_node pushable_tasks; struct mm_struct *mm, *active_mm; diff --git a/kernel/sched.c b/kernel/sched.c index 3acbad8991a..24ab80c2876 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -471,6 +471,7 @@ struct rt_rq { #ifdef CONFIG_SMP unsigned long rt_nr_migratory; int overloaded; + struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; @@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif + plist_node_init(&p->pushable_tasks, MAX_PRIO); + put_cpu(); } @@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; + plist_head_init(&rq->rt.pushable_tasks, &rq->lock); #endif rt_rq->rt_time = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index b0b6ea4ed67..fe9da6084c8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq) rq->rt.overloaded = 0; } } + +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + plist_node_init(&p->pushable_tasks, p->prio); + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +#else + +#define enqueue_pushable_task(rq, p) do { } while (0) +#define dequeue_pushable_task(rq, p) do { } while (0) + #endif /* CONFIG_SMP */ static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) @@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) enqueue_rt_entity(rt_se); + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); + inc_cpu_load(rq, p->se.load.weight); } @@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + dequeue_pushable_task(rq, p); + dec_cpu_load(rq, p->se.load.weight); } @@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, return next; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; @@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) p = rt_task_of(rt_se); p->se.exec_start = rq->clock; + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = _pick_next_task_rt(rq); + + /* The running task is never eligible for pushing */ + if (p) + dequeue_pushable_task(rq, p); + return p; } @@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); p->se.exec_start = 0; + + /* + * The previous task needs to be made eligible for pushing + * if it is still active + */ + if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); } #ifdef CONFIG_SMP @@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->rt.nr_cpus_allowed <= 1); + + BUG_ON(!p->se.on_rq); + BUG_ON(!rt_task(p)); + + return p; +} + /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task @@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; - int ret = 0; int paranoid = RT_MAX_TRIES; if (!rq->rt.overloaded) return 0; - next_task = pick_next_highest_task_rt(rq, -1); + next_task = pick_next_pushable_task(rq); if (!next_task) return 0; @@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq) * so it is possible that next_task has changed. * If it has, then try again. */ - task = pick_next_highest_task_rt(rq, -1); + task = pick_next_pushable_task(rq); if (unlikely(task != next_task) && task && paranoid--) { put_task_struct(next_task); next_task = task; goto retry; } + + /* + * Once we have failed to push this task, we will not + * try again, since the other cpus will pull from us + * when they are ready + */ + dequeue_pushable_task(rq, next_task); goto out; } @@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq) double_unlock_balance(rq, lowest_rq); - ret = 1; out: put_task_struct(next_task); - return ret; + return 1; } -/* - * TODO: Currently we just use the second highest prio task on - * the queue, and stop when it can't migrate (or there's - * no more RT tasks). There may be a case where a lower - * priority RT task has a different affinity than the - * higher RT task. In this case the lower RT task could - * possibly be able to migrate where as the higher priority - * RT task could not. We currently ignore this issue. - * Enhancements are welcome! - */ static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ @@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) */ static int needs_post_schedule_rt(struct rq *rq) { - return rq->rt.overloaded ? 1 : 0; + return has_pushable_tasks(rq); } static void post_schedule_rt(struct rq *rq) @@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && - rq->rt.overloaded && + has_pushable_tasks(rq) && p->rt.nr_cpus_allowed > 1) push_rt_tasks(rq); } @@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p, if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); + if (!task_current(rq, p)) { + /* + * Make sure we dequeue this task from the pushable list + * before going further. It will either remain off of + * the list because we are no longer pushable, or it + * will be requeued. + */ + if (p->rt.nr_cpus_allowed > 1) + dequeue_pushable_task(rq, p); + + /* + * Requeue if our weight is changing and still > 1 + */ + if (weight > 1) + enqueue_pushable_task(rq, p); + + } + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { rq->rt.rt_nr_migratory++; } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { @@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq) struct task_struct *p = rq->curr; p->se.exec_start = rq->clock; + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); } static const struct sched_class rt_sched_class = { -- cgit v1.2.3-70-g09d2 From 490dea45d00f01847ebebd007685d564aaf2cd98 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Nov 2008 17:06:57 +0100 Subject: itimers: remove the per-cpu-ish-ness Either we bounce once cacheline per cpu per tick, yielding n^2 bounces or we just bounce a single.. Also, using per-cpu allocations for the thread-groups complicates the per-cpu allocator in that its currently aimed to be a fixed sized allocator and the only possible extention to that would be vmap based, which is seriously constrained on 32 bit archs. So making the per-cpu memory requirement depend on the number of processes is an issue. Lastly, it didn't deal with cpu-hotplug, although admittedly that might be fixable. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 6 ++++ include/linux/sched.h | 29 ++++++++++++-------- kernel/fork.c | 15 +++++----- kernel/posix-cpu-timers.c | 70 ----------------------------------------------- kernel/sched_stats.h | 33 ++++++++++------------ 5 files changed, 46 insertions(+), 107 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 2f3c2d4ef73..ea0ea1a4c36 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -48,6 +48,12 @@ extern struct fs_struct init_fs; .posix_timers = LIST_HEAD_INIT(sig.posix_timers), \ .cpu_timers = INIT_CPU_TIMERS(sig.cpu_timers), \ .rlim = INIT_RLIMITS, \ + .cputime = { .totals = { \ + .utime = cputime_zero, \ + .stime = cputime_zero, \ + .sum_exec_runtime = 0, \ + .lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock), \ + }, }, \ } extern struct nsproxy init_nsproxy; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f..c20943eabb4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -450,6 +450,7 @@ struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; + spinlock_t lock; }; /* Alternate field names when used to cache expirations. */ #define prof_exp stime @@ -465,7 +466,7 @@ struct task_cputime { * used for thread group CPU clock calculations. */ struct thread_group_cputime { - struct task_cputime *totals; + struct task_cputime totals; }; /* @@ -2180,24 +2181,30 @@ static inline int spin_needbreak(spinlock_t *lock) * Thread group CPU time accounting. */ -extern int thread_group_cputime_alloc(struct task_struct *); -extern void thread_group_cputime(struct task_struct *, struct task_cputime *); - -static inline void thread_group_cputime_init(struct signal_struct *sig) +static inline +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - sig->cputime.totals = NULL; + struct task_cputime *totals = &tsk->signal->cputime.totals; + unsigned long flags; + + spin_lock_irqsave(&totals->lock, flags); + *times = *totals; + spin_unlock_irqrestore(&totals->lock, flags); } -static inline int thread_group_cputime_clone_thread(struct task_struct *curr) +static inline void thread_group_cputime_init(struct signal_struct *sig) { - if (curr->signal->cputime.totals) - return 0; - return thread_group_cputime_alloc(curr); + sig->cputime.totals = (struct task_cputime){ + .utime = cputime_zero, + .stime = cputime_zero, + .sum_exec_runtime = 0, + }; + + spin_lock_init(&sig->cputime.totals.lock); } static inline void thread_group_cputime_free(struct signal_struct *sig) { - free_percpu(sig->cputime.totals); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 7b8f2a78be3..7087d8c0e5e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -820,14 +820,15 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) int ret; if (clone_flags & CLONE_THREAD) { - ret = thread_group_cputime_clone_thread(current); - if (likely(!ret)) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - } - return ret; + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + + if (sig) + posix_cpu_timers_init_group(sig); + tsk->signal = sig; if (!sig) return -ENOMEM; @@ -864,8 +865,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); tty_audit_fork(sig); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 157de3a4783..fa07da94d7b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -9,76 +9,6 @@ #include #include -/* - * Allocate the thread_group_cputime structure appropriately and fill in the - * current values of the fields. Called from copy_signal() via - * thread_group_cputime_clone_thread() when adding a second or subsequent - * thread to a thread group. Assumes interrupts are enabled when called. - */ -int thread_group_cputime_alloc(struct task_struct *tsk) -{ - struct signal_struct *sig = tsk->signal; - struct task_cputime *cputime; - - /* - * If we have multiple threads and we don't already have a - * per-CPU task_cputime struct (checked in the caller), allocate - * one and fill it in with the times accumulated so far. We may - * race with another thread so recheck after we pick up the sighand - * lock. - */ - cputime = alloc_percpu(struct task_cputime); - if (cputime == NULL) - return -ENOMEM; - spin_lock_irq(&tsk->sighand->siglock); - if (sig->cputime.totals) { - spin_unlock_irq(&tsk->sighand->siglock); - free_percpu(cputime); - return 0; - } - sig->cputime.totals = cputime; - cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); - cputime->utime = tsk->utime; - cputime->stime = tsk->stime; - cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; - spin_unlock_irq(&tsk->sighand->siglock); - return 0; -} - -/** - * thread_group_cputime - Sum the thread group time fields across all CPUs. - * - * @tsk: The task we use to identify the thread group. - * @times: task_cputime structure in which we return the summed fields. - * - * Walk the list of CPUs to sum the per-CPU time fields in the thread group - * time structure. - */ -void thread_group_cputime( - struct task_struct *tsk, - struct task_cputime *times) -{ - struct task_cputime *totals, *tot; - int i; - - totals = tsk->signal->cputime.totals; - if (!totals) { - times->utime = tsk->utime; - times->stime = tsk->stime; - times->sum_exec_runtime = tsk->se.sum_exec_runtime; - return; - } - - times->stime = times->utime = cputime_zero; - times->sum_exec_runtime = 0; - for_each_possible_cpu(i) { - tot = per_cpu_ptr(totals, i); - times->utime = cputime_add(times->utime, tot->utime); - times->stime = cputime_add(times->stime, tot->stime); - times->sum_exec_runtime += tot->sum_exec_runtime; - } -} - /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index f2773b5d122..8ab0cef8eca 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,6 +296,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { + struct task_cputime *times; struct signal_struct *sig; /* tsk == current, ensure it is safe to use ->signal */ @@ -303,13 +304,11 @@ static inline void account_group_user_time(struct task_struct *tsk, return; sig = tsk->signal; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->utime = cputime_add(times->utime, cputime); - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->utime = cputime_add(times->utime, cputime); + spin_unlock(×->lock); } /** @@ -325,6 +324,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { + struct task_cputime *times; struct signal_struct *sig; /* tsk == current, ensure it is safe to use ->signal */ @@ -332,13 +332,11 @@ static inline void account_group_system_time(struct task_struct *tsk, return; sig = tsk->signal; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->stime = cputime_add(times->stime, cputime); - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->stime = cputime_add(times->stime, cputime); + spin_unlock(×->lock); } /** @@ -354,6 +352,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { + struct task_cputime *times; struct signal_struct *sig; sig = tsk->signal; @@ -362,11 +361,9 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - if (sig->cputime.totals) { - struct task_cputime *times; + times = &sig->cputime.totals; - times = per_cpu_ptr(sig->cputime.totals, get_cpu()); - times->sum_exec_runtime += ns; - put_cpu_no_resched(); - } + spin_lock(×->lock); + times->sum_exec_runtime += ns; + spin_unlock(×->lock); } -- cgit v1.2.3-70-g09d2 From baf48f6577e581a9adb8fe849dc80e24b21d171d Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Mon, 12 Jan 2009 21:15:17 -0800 Subject: softlock: fix false panic which can occur if softlockup_thresh is reduced At run-time, if softlockup_thresh is changed to a much lower value, touch_timestamp is likely to be much older than the new softlock_thresh. This will cause a false softlockup to be detected. If softlockup_panic is enabled, the system will panic. The fix is to touch all watchdogs before changing softlockup_thresh. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/softlockup.c | 9 +++++++++ kernel/sysctl.c | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cae9b81a1f..54cbabf3b87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -293,6 +293,9 @@ extern void sched_show_task(struct task_struct *p); extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); +extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d9188c66278..85d5a245510 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void) } EXPORT_SYMBOL(touch_all_softlockup_watchdogs); +int proc_dosoftlockup_thresh(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + touch_all_softlockup_watchdogs(); + return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); +} + /* * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 89d74436318..596dc31a711 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -800,7 +800,7 @@ static struct ctl_table kern_table[] = { .data = &softlockup_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &proc_dosoftlockup_thresh, .strategy = &sysctl_intvec, .extra1 = &neg_one, .extra2 = &sixty, -- cgit v1.2.3-70-g09d2 From 9df04e1f25effde823a600e755b51475d438f56b Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 29 Jan 2009 14:25:26 -0800 Subject: epoll: drop max_user_instances and rely only on max_user_watches Linus suggested to put limits where the money is, and max_user_watches already does that w/out the need of max_user_instances. That has the advantage to mitigate the potential DoS while allowing pretty generous default behavior. Allowing top 4% of low memory (per user) to be allocated in epoll watches, we have: LOMEM MAX_WATCHES (per user) 512MB ~178000 1GB ~356000 2GB ~712000 A box with 512MB of lomem, will meet some challenge in hitting 180K watches, socket buffers math teaches us. No more max_user_instances limits then. Signed-off-by: Davide Libenzi Cc: Willy Tarreau Cc: Michael Kerrisk Cc: Bron Gondwana Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 22 ++++------------------ include/linux/sched.h | 1 - 2 files changed, 4 insertions(+), 19 deletions(-) (limited to 'include/linux/sched.h') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index ba2f9ec7119..011b9b8c90c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -234,8 +234,6 @@ struct ep_pqueue { /* * Configuration options available inside /proc/sys/fs/epoll/ */ -/* Maximum number of epoll devices, per user */ -static int max_user_instances __read_mostly; /* Maximum number of epoll watched descriptors, per user */ static int max_user_watches __read_mostly; @@ -260,14 +258,6 @@ static struct kmem_cache *pwq_cache __read_mostly; static int zero; ctl_table epoll_table[] = { - { - .procname = "max_user_instances", - .data = &max_user_instances, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &zero, - }, { .procname = "max_user_watches", .data = &max_user_watches, @@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); - atomic_dec(&ep->user->epoll_devs); free_uid(ep->user); kfree(ep); } @@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep) struct eventpoll *ep; user = get_current_user(); - error = -EMFILE; - if (unlikely(atomic_read(&user->epoll_devs) >= - max_user_instances)) - goto free_uid; error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) @@ -1141,7 +1126,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) flags & O_CLOEXEC); if (fd < 0) ep_free(ep); - atomic_inc(&ep->user->epoll_devs); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", @@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void) struct sysinfo si; si_meminfo(&si); - max_user_instances = 128; - max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / + /* + * Allows top 4% of lomem to be allocated for epoll watches (per user). + */ + max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 02e16d20730..5a7c7638873 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -630,7 +630,6 @@ struct user_struct { atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ #endif #ifdef CONFIG_EPOLL - atomic_t epoll_devs; /* The number of epoll descriptors currently open */ atomic_t epoll_watches; /* The number of file descriptors currently watched */ #endif #ifdef CONFIG_POSIX_MQUEUE -- cgit v1.2.3-70-g09d2 From 35626129abcd6a7547e84c817ef5b6eff7a8758b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Feb 2009 16:00:29 -0800 Subject: sched: add missing kernel-doc in sched.h Add kernel-doc notation for @lock: include/linux/sched.h:457: No description found for parameter 'lock' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a7c7638873..2127e959e0f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -443,6 +443,7 @@ struct pacct_struct { * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds + * @lock: lock for fields in this struct * * This structure groups together three kinds of CPU time that are * tracked for threads and thread groups. Most things considering -- cgit v1.2.3-70-g09d2