From 2b5fe6de58276d0b5a7c884d5dbfc300ca47db78 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:40:08 +0100 Subject: thread_group_cputime: move a couple of callsites outside of ->siglock Impact: relax the locking of cpu-time accounting calls ->siglock buys nothing for thread_group_cputime() in do_sys_times() and wait_task_zombie() (which btw takes the unrelated parent's ->siglock). Actually I think do_sys_times() doesn't need ->siglock at all. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fa..b9c4d8bb72e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1330,10 +1330,10 @@ static int wait_task_zombie(struct task_struct *p, int options, * group, which consolidates times for all threads in the * group including the group leader. */ + thread_group_cputime(p, &cputime); spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; - thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, cputime_add(cputime.utime, -- cgit v1.2.3-70-g09d2 From 7c0990c7ee988aa193abbb7da3faeb9279146dbf Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Wed, 19 Nov 2008 10:20:23 +0100 Subject: Do not free io context when taking recursive faults in do_exit When taking recursive faults in do_exit, if the io_context is not null, exit_io_context() is being called. But it might decrement the refcount more than once. It is better to leave this task alone. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index c7422ca9203..9a213474f54 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1037,8 +1037,6 @@ NORET_TYPE void do_exit(long code) * task into the wait for ever nirwana as well. */ tsk->flags |= PF_EXITPIDONE; - if (tsk->io_context) - exit_io_context(); set_current_state(TASK_UNINTERRUPTIBLE); schedule(); } -- cgit v1.2.3-70-g09d2 From e5991371ee0d1c0ce19e133c6f9075b49c5b4ae8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 6 Jan 2009 14:39:22 -0800 Subject: mm: remove cgroup_mm_owner_callbacks cgroup_mm_owner_callbacks() was brought in to support the memrlimit controller, but sneaked into mainline ahead of it. That controller has now been shelved, and the mm_owner_changed() args were inadequate for it anyway (they needed an mm pointer instead of a task pointer). Remove the dead code, and restore mm_update_next_owner() locking to how it was before: taking mmap_sem there does nothing for memcontrol.c, now the only user of mm->owner. Signed-off-by: Hugh Dickins Cc: Paul Menage Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 14 +------------- kernel/cgroup.c | 33 --------------------------------- kernel/exit.c | 16 ++++++---------- 3 files changed, 7 insertions(+), 56 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1164963c3a8..08b78c09b09 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -329,13 +329,7 @@ struct cgroup_subsys { struct cgroup *cgrp); void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); - /* - * This routine is called with the task_lock of mm->owner held - */ - void (*mm_owner_changed)(struct cgroup_subsys *ss, - struct cgroup *old, - struct cgroup *new, - struct task_struct *p); + int subsys_id; int active; int disabled; @@ -400,9 +394,6 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); -void cgroup_mm_owner_callbacks(struct task_struct *old, - struct task_struct *new); - #else /* !CONFIG_CGROUPS */ static inline int cgroup_init_early(void) { return 0; } @@ -420,9 +411,6 @@ static inline int cgroupstats_build(struct cgroupstats *stats, return -EINVAL; } -static inline void cgroup_mm_owner_callbacks(struct task_struct *old, - struct task_struct *new) {} - #endif /* !CONFIG_CGROUPS */ #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 87bb0258fd2..f221446aa02 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -116,7 +116,6 @@ static int root_count; * be called. */ static int need_forkexit_callback __read_mostly; -static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -2539,7 +2538,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; - need_mm_owner_callback |= !!ss->mm_owner_changed; /* At system boot, before all subsystems have been * registered, no tasks have been forked, so we don't @@ -2789,37 +2787,6 @@ void cgroup_fork_callbacks(struct task_struct *child) } } -#ifdef CONFIG_MM_OWNER -/** - * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes - * @p: the new owner - * - * Called on every change to mm->owner. mm_init_owner() does not - * invoke this routine, since it assigns the mm->owner the first time - * and does not change it. - * - * The callbacks are invoked with mmap_sem held in read mode. - */ -void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) -{ - struct cgroup *oldcgrp, *newcgrp = NULL; - - if (need_mm_owner_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - oldcgrp = task_cgroup(old, ss->subsys_id); - if (new) - newcgrp = task_cgroup(new, ss->subsys_id); - if (oldcgrp == newcgrp) - continue; - if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); - } - } -} -#endif /* CONFIG_MM_OWNER */ - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question diff --git a/kernel/exit.c b/kernel/exit.c index c9e5a1c14e0..f923724ab3c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -642,35 +642,31 @@ retry: /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or - * ptrace or page migration (get_task_mm()). Mark owner as NULL, - * so that subsystems can understand the callback and take action. + * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - down_write(&mm->mmap_sem); - cgroup_mm_owner_callbacks(mm->owner, NULL); mm->owner = NULL; - up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); - read_unlock(&tasklist_lock); - down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } - cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); - up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ -- cgit v1.2.3-70-g09d2 From 901608d9045146aec6f14a7777ea4b1501c379f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 6 Jan 2009 14:40:29 -0800 Subject: mm: introduce get_mm_hiwater_xxx(), fix taskstats->hiwater_xxx accounting xacct_add_tsk() relies on do_exit()->update_hiwater_xxx() and uses mm->hiwater_xxx directly, this leads to 2 problems: - taskstats_user_cmd() can call fill_pid()->xacct_add_tsk() at any moment before the task exits, so we should check the current values of rss/vm anyway. - do_exit()->update_hiwater_xxx() calls are racy. An exiting thread can be preempted right before mm->hiwater_xxx = new_val, and another thread can use A_LOT of memory and exit in between. When the first thread resumes it can be the last thread in the thread group, in that case we report the wrong hiwater_xxx values which do not take A_LOT into account. Introduce get_mm_hiwater_rss() and get_mm_hiwater_vm() helpers and change xacct_add_tsk() to use them. The first helper will also be used by rusage->ru_maxrss accounting. Kill do_exit()->update_hiwater_xxx() calls. Unless we are going to decrease rss/vm there is no point to update mm->hiwater_xxx, and nobody can look at this mm_struct when exit_mmap() actually unmaps the memory. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 +++ kernel/exit.c | 5 +---- kernel/tsacct.c | 4 ++-- mm/mmap.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 38a3f4b1539..ea415136ac9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -386,6 +386,9 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) +#define get_mm_hiwater_rss(mm) max((mm)->hiwater_rss, get_mm_rss(mm)) +#define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm) + extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); diff --git a/kernel/exit.c b/kernel/exit.c index f923724ab3c..c7740fa3252 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1051,10 +1051,7 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - if (tsk->mm) { - update_hiwater_rss(tsk->mm); - update_hiwater_vm(tsk->mm); - } + group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 2dc06ab3571..43f891b05a4 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ - stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; - stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; mmput(mm); } stats->read_char = p->ioac.rchar; diff --git a/mm/mmap.c b/mm/mmap.c index e4507b23e62..1f97d8aa9b0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2102,7 +2102,7 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); - /* Don't update_hiwater_rss(mm) here, do_exit already did */ + /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); -- cgit v1.2.3-70-g09d2