From cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 12:02:48 +0200 Subject: perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has outgrown its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in all, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks go to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility are not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\<event\>/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. 
This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian Acked-by: Peter Zijlstra Acked-by: Paul Mackerras Reviewed-by: Arjan van de Ven Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/exit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index ae5d8660ddf..e47ee8a0613 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -981,7 +981,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. 
*/ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA -- cgit v1.2.3-70-g09d2 From 02b51df1b07b4e9ca823c89284e704cadb323cd1 Mon Sep 17 00:00:00 2001 From: Scott James Remnant Date: Tue, 22 Sep 2009 16:43:44 -0700 Subject: proc connector: add event for process becoming session leader The act of a process becoming a session leader is a useful signal to a supervising init daemon such as Upstart. While a daemon will normally do this as part of the process of becoming a daemon, it is rare for its children to do so. When the children do, it is nearly always a sign that the child should be considered detached from the parent and not supervised along with it. The poster-child example is OpenSSH; the per-login children call setsid() so that they may control the pty connected to them. If the primary daemon dies or is restarted, we do not want to consider the per-login children and want to respawn the primary daemon without killing the children. This patch adds a new PROC_SID_EVENT and associated structure to the proc_event event_data union, it arranges for this to be emitted when the special PIDTYPE_SID pid is set. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Scott James Remnant Acked-by: Matt Helsley Cc: Oleg Nesterov Cc: Evgeniy Polyakov Acked-by: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/connector/cn_proc.c | 25 +++++++++++++++++++++++++ include/linux/cn_proc.h | 10 ++++++++++ kernel/exit.c | 4 +++- 3 files changed, 38 insertions(+), 1 deletion(-) (limited to 'kernel/exit.c') diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 85e5dc0431f..abf4a2529f8 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -139,6 +139,31 @@ void proc_id_connector(struct task_struct *task, int which_id) cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } +void proc_sid_connector(struct task_struct *task) +{ + struct cn_msg *msg; + struct proc_event *ev; + struct timespec ts; + __u8 buffer[CN_PROC_MSG_SIZE]; + + if (atomic_read(&proc_event_num_listeners) < 1) + return; + + msg = (struct cn_msg *)buffer; + ev = (struct proc_event *)msg->data; + get_seq(&msg->seq, &ev->cpu); + ktime_get_ts(&ts); /* get high res monotonic timestamp */ + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); + ev->what = PROC_EVENT_SID; + ev->event_data.sid.process_pid = task->pid; + ev->event_data.sid.process_tgid = task->tgid; + + memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); + msg->ack = 0; /* not used */ + msg->len = sizeof(*ev); + cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); +} + void proc_exit_connector(struct task_struct *task) { struct cn_msg *msg; diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index b8125b2eb66..47dac5ea8d3 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -52,6 +52,7 @@ struct proc_event { PROC_EVENT_EXEC = 0x00000002, PROC_EVENT_UID = 0x00000004, PROC_EVENT_GID = 0x00000040, + PROC_EVENT_SID = 0x00000080, /* "next" should be 0x00000400 */ /* "last" is the last process event: exit */ PROC_EVENT_EXIT = 0x80000000 @@ -89,6 +90,11 @@ struct proc_event { } e; } id; + struct sid_proc_event { + __kernel_pid_t process_pid; + __kernel_pid_t process_tgid; + } sid; + struct exit_proc_event { 
__kernel_pid_t process_pid; __kernel_pid_t process_tgid; @@ -102,6 +108,7 @@ struct proc_event { void proc_fork_connector(struct task_struct *task); void proc_exec_connector(struct task_struct *task); void proc_id_connector(struct task_struct *task, int which_id); +void proc_sid_connector(struct task_struct *task); void proc_exit_connector(struct task_struct *task); #else static inline void proc_fork_connector(struct task_struct *task) @@ -114,6 +121,9 @@ static inline void proc_id_connector(struct task_struct *task, int which_id) {} +static inline void proc_sid_connector(struct task_struct *task) +{} + static inline void proc_exit_connector(struct task_struct *task) {} #endif /* CONFIG_PROC_EVENTS */ diff --git a/kernel/exit.c b/kernel/exit.c index e47ee8a0613..61bb1761c7b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) + if (task_session(curr) != pid) { change_pid(curr, PIDTYPE_SID, pid); + proc_sid_connector(curr); + } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); -- cgit v1.2.3-70-g09d2 From 1f10206cf8e945220f7220a809d8bfc15c21f9a5 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 22 Sep 2009 16:44:10 -0700 Subject: getrusage: fill ru_maxrss value Make ->ru_maxrss value in struct rusage filled accordingly to rss hiwater mark. This struct is filled as a parameter to getrusage syscall. ->ru_maxrss value is set to KBs which is the way it is done in BSD systems. /usr/bin/time (gnu time) application converts ->ru_maxrss to KBs which seems to be incorrect behavior. Maintainer of this util was notified by me with the patch which corrects it and cc'ed. To make this happen we extend struct signal_struct by two fields. The first one is ->maxrss which we use to store rss hiwater of the task. The second one is ->cmaxrss which we use to store highest rss hiwater of all task childs. 
These values are used in k_getrusage() to actually fill ->ru_maxrss. k_getrusage() uses current rss hiwater value directly if mm struct exists. Note: exec() clears mm->hiwater_rss, but doesn't clear sig->maxrss. This is intentional behavior: *BSD getrusage() has exec() inheriting. test programs ======================================================== getrusage.c =========== #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #define err(str) perror(str), exit(1) int main(int argc, char** argv) { int status; printf("allocate 100MB\n"); consume(100); printf("testcase1: fork inherit? \n"); printf(" expect: initial.self ~= child.self\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { show_rusage("fork child"); _exit(0); } printf("\n"); printf("testcase2: fork inherit? (cont.) \n"); printf(" expect: initial.children ~= 100MB, but child.children = 0\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { show_rusage("child"); _exit(0); } printf("\n"); printf("testcase3: fork + malloc \n"); printf(" expect: child.self ~= initial.self + 50MB\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { printf("allocate +50MB\n"); consume(50); show_rusage("fork child"); _exit(0); } printf("\n"); printf("testcase4: grandchild maxrss\n"); printf(" expect: post_wait.children ~= 300MB\n"); show_rusage("initial"); if (__fork()) { wait(&status); show_rusage("post_wait"); } else { system("./child -n 0 -g 300"); _exit(0); } printf("\n"); printf("testcase5: zombie\n"); printf(" expect: pre_wait ~= initial, IOW the zombie process is not accounted.\n"); printf(" post_wait ~= 400MB, IOW wait() collect child's max_rss. 
\n"); show_rusage("initial"); if (__fork()) { sleep(1); /* children become zombie */ show_rusage("pre_wait"); wait(&status); show_rusage("post_wait"); } else { system("./child -n 400"); _exit(0); } printf("\n"); printf("testcase6: SIG_IGN\n"); printf(" expect: initial ~= after_zombie (child's 500MB alloc should be ignored).\n"); show_rusage("initial"); signal(SIGCHLD, SIG_IGN); if (__fork()) { sleep(1); /* children become zombie */ show_rusage("after_zombie"); } else { system("./child -n 500"); _exit(0); } printf("\n"); signal(SIGCHLD, SIG_DFL); printf("testcase7: exec (without fork) \n"); printf(" expect: initial ~= exec \n"); show_rusage("initial"); execl("./child", "child", "-v", NULL); return 0; } child.c ======= #include #include #include #include #include #include #include #include #include #include #include "common.h" int main(int argc, char** argv) { int status; int c; long consume_size = 0; long grandchild_consume_size = 0; int show = 0; while ((c = getopt(argc, argv, "n:g:v")) != -1) { switch (c) { case 'n': consume_size = atol(optarg); break; case 'v': show = 1; break; case 'g': grandchild_consume_size = atol(optarg); break; default: break; } } if (show) show_rusage("exec"); if (consume_size) { printf("child alloc %ldMB\n", consume_size); consume(consume_size); } if (grandchild_consume_size) { if (fork()) { wait(&status); } else { printf("grandchild alloc %ldMB\n", grandchild_consume_size); consume(grandchild_consume_size); exit(0); } } return 0; } common.c ======== #include #include #include #include #include #include #include #include #include #include #include #include "common.h" #define err(str) perror(str), exit(1) void show_rusage(char *prefix) { int err, err2; struct rusage rusage_self; struct rusage rusage_children; printf("%s: ", prefix); err = getrusage(RUSAGE_SELF, &rusage_self); if (!err) printf("self %ld ", rusage_self.ru_maxrss); err2 = getrusage(RUSAGE_CHILDREN, &rusage_children); if (!err2) printf("children %ld ", 
rusage_children.ru_maxrss); printf("\n"); } /* Some buggy OS need this worthless CPU waste. */ void make_pagefault(void) { void *addr; int size = getpagesize(); int i; for (i=0; i<1000; i++) { addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); if (addr == MAP_FAILED) err("make_pagefault"); memset(addr, 0, size); munmap(addr, size); } } void consume(int mega) { size_t sz = mega * 1024 * 1024; void *ptr; ptr = malloc(sz); memset(ptr, 0, sz); make_pagefault(); } pid_t __fork(void) { pid_t pid; pid = fork(); make_pagefault(); return pid; } common.h ======== void show_rusage(char *prefix); void make_pagefault(void); void consume(int mega); pid_t __fork(void); FreeBSD result (expected result) ======================================================== allocate 100MB testcase1: fork inherit? expect: initial.self ~= child.self initial: self 103492 children 0 fork child: self 103540 children 0 testcase2: fork inherit? (cont.) expect: initial.children ~= 100MB, but child.children = 0 initial: self 103540 children 103540 child: self 103564 children 0 testcase3: fork + malloc expect: child.self ~= initial.self + 50MB initial: self 103564 children 103564 allocate +50MB fork child: self 154860 children 0 testcase4: grandchild maxrss expect: post_wait.children ~= 300MB initial: self 103564 children 154860 grandchild alloc 300MB post_wait: self 103564 children 308720 testcase5: zombie expect: pre_wait ~= initial, IOW the zombie process is not accounted. post_wait ~= 400MB, IOW wait() collect child's max_rss. initial: self 103564 children 308720 child alloc 400MB pre_wait: self 103564 children 308720 post_wait: self 103564 children 411312 testcase6: SIG_IGN expect: initial ~= after_zombie (child's 500MB alloc should be ignored). 
initial: self 103564 children 411312 child alloc 500MB after_zombie: self 103624 children 411312 testcase7: exec (without fork) expect: initial ~= exec initial: self 103624 children 411312 exec: self 103624 children 411312 Linux result (actual test result) ======================================================== allocate 100MB testcase1: fork inherit? expect: initial.self ~= child.self initial: self 102848 children 0 fork child: self 102572 children 0 testcase2: fork inherit? (cont.) expect: initial.children ~= 100MB, but child.children = 0 initial: self 102876 children 102644 child: self 102572 children 0 testcase3: fork + malloc expect: child.self ~= initial.self + 50MB initial: self 102876 children 102644 allocate +50MB fork child: self 153804 children 0 testcase4: grandchild maxrss expect: post_wait.children ~= 300MB initial: self 102876 children 153864 grandchild alloc 300MB post_wait: self 102876 children 307536 testcase5: zombie expect: pre_wait ~= initial, IOW the zombie process is not accounted. post_wait ~= 400MB, IOW wait() collect child's max_rss. initial: self 102876 children 307536 child alloc 400MB pre_wait: self 102876 children 307536 post_wait: self 102876 children 410076 testcase6: SIG_IGN expect: initial ~= after_zombie (child's 500MB alloc should be ignored). 
initial: self 102876 children 410076 child alloc 500MB after_zombie: self 102880 children 410076 testcase7: exec (without fork) expect: initial ~= exec initial: self 102880 children 410076 exec: self 102880 children 410076 Signed-off-by: Jiri Pirko Signed-off-by: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +++ include/linux/sched.h | 10 ++++++++++ kernel/exit.c | 6 ++++++ kernel/fork.c | 1 + kernel/sys.c | 14 ++++++++++++++ 5 files changed, 34 insertions(+) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index 434dba778cc..69bb9d89979 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -845,6 +845,9 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = 0; no_thread_group: + if (current->mm) + setmax_mm_hiwater_rss(&sig->maxrss, current->mm); + exit_itimers(sig); flush_itimer_signals(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 97b10da0a3e..6448bbc6406 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -426,6 +426,15 @@ static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) return max(mm->hiwater_rss, get_mm_rss(mm)); } +static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, + struct mm_struct *mm) +{ + unsigned long hiwater_rss = get_mm_hiwater_rss(mm); + + if (*maxrss < hiwater_rss) + *maxrss = hiwater_rss; +} + static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); @@ -612,6 +621,7 @@ struct signal_struct { unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; + unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* diff --git a/kernel/exit.c b/kernel/exit.c index 61bb1761c7b..60d6fdcc926 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -947,6 +947,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) { 
hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); + if (tsk->mm) + setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) @@ -1210,6 +1212,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; + unsigned long maxrss; /* * The resource counters for the group leader are in its @@ -1258,6 +1261,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + maxrss = max(sig->maxrss, sig->cmaxrss); + if (psig->cmaxrss < maxrss) + psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->real_parent->sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c index 1020977b57c..7cf45812ce8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -866,6 +866,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; + sig->maxrss = sig->cmaxrss = 0; task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); diff --git a/kernel/sys.c b/kernel/sys.c index ea5c3bcac88..ebcb1561172 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long flags; cputime_t utime, stime; struct task_cputime cputime; + unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; @@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = task_utime(current); stime = task_stime(current); accumulate_thread_rusage(p, r); + maxrss = 
p->signal->maxrss; goto out; } @@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->signal->cmaj_flt; r->ru_inblock = p->signal->cinblock; r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; if (who == RUSAGE_CHILDREN) break; @@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += p->signal->maj_flt; r->ru_inblock += p->signal->inblock; r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; t = p; do { accumulate_thread_rusage(t, r); @@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) -- cgit v1.2.3-70-g09d2 From a7f0765edfd53aed09cb7b0e15863688b39447de Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:44 -0700 Subject: ptrace: __ptrace_detach: do __wake_up_parent() if we reap the tracee The bug is old, it wasn't caused by recent changes. Test case: static void *tfunc(void *arg) { int pid = (long)arg; assert(ptrace(PTRACE_ATTACH, pid, NULL, NULL) == 0); kill(pid, SIGKILL); sleep(1); return NULL; } int main(void) { pthread_t th; long pid = fork(); if (!pid) pause(); signal(SIGCHLD, SIG_IGN); assert(pthread_create(&th, NULL, tfunc, (void*)pid) == 0); int r = waitpid(-1, NULL, __WNOTHREAD); printf("waitpid: %d %m\n", r); return 0; } Before the patch this program hangs, after this patch waitpid() correctly fails with errno == -ECHILD. 
The problem is, __ptrace_detach() reaps the EXIT_ZOMBIE tracee if its ->real_parent is our sub-thread and we ignore SIGCHLD. But in this case we should wake up other threads which can sleep in do_wait(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/exit.c | 5 +++++ kernel/ptrace.c | 11 +++++++---- kernel/signal.c | 9 --------- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/exit.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 848d1f20086..9e5a88afe6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2059,6 +2059,7 @@ extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern int kill_proc_info(int, struct siginfo *, pid_t); extern int do_notify_parent(struct task_struct *, int); +extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); extern void force_sig_specific(int, struct task_struct *); extern int send_sig(int, struct task_struct *, int); diff --git a/kernel/exit.c b/kernel/exit.c index 60d6fdcc926..782b2e1f7ca 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1575,6 +1575,11 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +void __wake_up_parent(struct task_struct *p, struct task_struct *parent) +{ + wake_up_interruptible_sync(&parent->signal->wait_chldexit); +} + static long do_wait(struct wait_opts *wo) { DECLARE_WAITQUEUE(wait, current); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 307c285af59..23bd09cd042 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. 
* - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); p->exit_signal = -1; + } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/kernel/signal.c b/kernel/signal.c index 64c5deeaca5..534ea81cde4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1382,15 +1382,6 @@ ret: return ret; } -/* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. -- cgit v1.2.3-70-g09d2 From a2322e1d272938d192d8c24cdacf57c0c7a2683f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:45 -0700 Subject: do_wait() wakeup optimization: shift security_task_wait() from eligible_child() to wait_consider_task() Preparation, no functional changes. eligible_child() has a single caller, wait_consider_task(). We can move security_task_wait() out from eligible_child(), this allows us to use it for filtered wake_up(). 
Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Ingo Molnar Cc: Ratan Nalumasu Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 782b2e1f7ca..ef2dfa818bf 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1112,8 +1112,6 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) static int eligible_child(struct wait_opts *wo, struct task_struct *p) { - int err; - if (wo->wo_type < PIDTYPE_MAX) { if (task_pid_type(p, wo->wo_type) != wo->wo_pid) return 0; @@ -1128,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p) && !(wo->wo_flags & __WALL)) return 0; - err = security_task_wait(p); - if (err) - return err; - return 1; } @@ -1492,6 +1486,7 @@ static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, if (!ret) return ret; + ret = security_task_wait(p); if (unlikely(ret < 0)) { /* * If we have not yet seen any eligible child, -- cgit v1.2.3-70-g09d2 From 0b7570e77f7c3abd43107dabc47ea89daf9a1cba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:46 -0700 Subject: do_wait() wakeup optimization: change __wake_up_parent() to use filtered wakeup Ratan Nalumasu reported that a process with many threads does unnecessary wakeups. Every waiting thread in the process wakes up to loop through the children and see that the only ones it cares about are still not ready. Now that we have struct wait_opts we can change do_wait/__wake_up_parent to use filtered wakeups. We can make child_wait_callback() more clever later, right now it only checks eligible_child(). 
Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Ingo Molnar Cc: Ratan Nalumasu Cc: Vitaly Mayatskikh Acked-by: James Morris Tested-by: Valdis Kletnieks Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 25 +++++++++++++++++++++---- security/selinux/hooks.c | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index ef2dfa818bf..7838b4d6877 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1097,6 +1097,7 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; + wait_queue_t child_wait; int notask_error; }; @@ -1570,20 +1571,35 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static int child_wait_callback(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, + child_wait); + struct task_struct *p = key; + + if (!eligible_child(wo, p)) + return 0; + + return default_wake_function(wait, mode, sync, key); +} + void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { - wake_up_interruptible_sync(&parent->signal->wait_chldexit); + __wake_up_sync_key(&parent->signal->wait_chldexit, + TASK_INTERRUPTIBLE, 1, p); } static long do_wait(struct wait_opts *wo) { - DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); - add_wait_queue(&current->signal->wait_chldexit,&wait); + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our critiera just get out. 
@@ -1624,7 +1640,8 @@ notask: } end: __set_current_state(TASK_RUNNING); - remove_wait_queue(&current->signal->wait_chldexit,&wait); + remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); + if (wo->wo_info) { struct siginfo __user *infop = wo->wo_info; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 417f7c99452..bb230d5d708 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2411,7 +2411,7 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) /* Wake up the parent if it is waiting so that it can recheck * wait permission to the new task SID. */ read_lock(&tasklist_lock); - wake_up_interruptible(&current->real_parent->signal->wait_chldexit); + __wake_up_parent(current, current->real_parent); read_unlock(&tasklist_lock); } -- cgit v1.2.3-70-g09d2 From b4fe51823d797d6959b2eee7868023e61606daa9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:47 -0700 Subject: do_wait() wakeup optimization: child_wait_callback: check __WNOTHREAD case Suggested by Roland. do_wait(__WNOTHREAD) can only succeed if the caller is either ptracer, or it is ->real_parent and the child is not traced. IOW, caller == p->parent otherwise we should not wake up. Change child_wait_callback() to check this. Ratan reports the workload with CPU load >99% caused by unnecessary wakeups, should be fixed by this patch.
Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Ingo Molnar Cc: Ratan Nalumasu Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 7838b4d6877..270a68b7f22 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1581,6 +1581,9 @@ static int child_wait_callback(wait_queue_t *wait, unsigned mode, if (!eligible_child(wo, p)) return 0; + if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) + return 0; + return default_wake_function(wait, mode, sync, key); } -- cgit v1.2.3-70-g09d2 From 5c01ba49e6647d86bc7576105f82027200d1f303 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:48 -0700 Subject: do_wait-wakeup-optimization: fix child_wait_callback()->eligible_child() usage child_wait_callback()->eligible_child() is not right, we can miss the wakeup if the task was detached before __wake_up_parent() and the caller of do_wait() didn't use __WALL. Move ->wo_pid checks from eligible_child() to the new helper, eligible_pid(), and change child_wait_callback() to use it instead of eligible_child(). Note: actually I think it would be better to fix the __WCLONE check in eligible_child(), it doesn't look exactly right. But it is not clear what is the supposed behaviour, and any change is user-visible. 
Reported-by: KAMEZAWA Hiroyuki Tested-by: KAMEZAWA Hiroyuki Signed-off-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 270a68b7f22..3fb9a77863d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1111,13 +1111,16 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) return pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static inline int eligible_pid(struct wait_opts *wo, struct task_struct *p) { - if (wo->wo_type < PIDTYPE_MAX) { - if (task_pid_type(p, wo->wo_type) != wo->wo_pid) - return 0; - } + return wo->wo_type == PIDTYPE_MAX || + task_pid_type(p, wo->wo_type) == wo->wo_pid; +} +static int eligible_child(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return 0; /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1578,7 +1581,7 @@ static int child_wait_callback(wait_queue_t *wait, unsigned mode, child_wait); struct task_struct *p = key; - if (!eligible_child(wo, p)) + if (!eligible_pid(wo, p)) return 0; if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) -- cgit v1.2.3-70-g09d2 From 989264f4645c183331a1279d513f4b1ddc06e1f5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:49 -0700 Subject: do_wait-wakeup-optimization: simplify task_pid_type() task_pid_type() is only used by eligible_pid() which has to check wo_type != PIDTYPE_MAX anyway. Remove this check from task_pid_type() and factor out ->pids[type] access, this shrinks .text a bit and simplifies the code. This matches the behaviour of other similar helpers, say get_task_pid(). The caller must ensure that pid_type is valid, not the callee.
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 3fb9a77863d..650c1d1a55d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1101,17 +1101,15 @@ struct wait_opts { int notask_error; }; -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; } -static inline int eligible_pid(struct wait_opts *wo, struct task_struct *p) +static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; -- cgit v1.2.3-70-g09d2 From b6e763f07fba6243d2a553ed9a4f3e10a789932a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 23 Sep 2009 15:56:50 -0700 Subject: wait_consider_task: kill "parent" argument Kill the unused "parent" argument in wait_consider_task(), it was never used. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Ratan Nalumasu Cc: Vitaly Mayatskikh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 650c1d1a55d..1daa7f46bcc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1481,8 +1481,8 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. 
 */ -static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, - int ptrace, struct task_struct *p) +static int wait_consider_task(struct wait_opts *wo, int ptrace, + struct task_struct *p) { int ret = eligible_child(wo, p); if (!ret) @@ -1550,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) * Do not consider detached threads. */ if (!task_detached(p)) { - int ret = wait_consider_task(wo, tsk, 0, p); + int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } @@ -1564,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, tsk, 1, p); + int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } -- cgit v1.2.3-70-g09d2 From dfe16dfa4ac178d9a10b489a73d535c6976e48d2 Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Wed, 23 Sep 2009 15:56:51 -0700 Subject: do_wait: fix sys_waitid()-specific behaviour do_wait() checks ->wo_info to figure out who is the caller. If it's not NULL the caller should be sys_waitid(), in that case do_wait() fixes up the retval or zeros ->wo_info, depending on retval from underlying function. This is a bug: user can pass ->wo_info == NULL and sys_waitid() will return an incorrect value. man 2 waitid says: waitid(): returns 0 on success Test-case: int main(void) { if (fork()) assert(waitid(P_ALL, 0, NULL, WEXITED) == 0); return 0; } Result: Assertion `waitid(P_ALL, 0, ((void *)0), 4) == 0' failed. Move that code to sys_waitid(). User-visible change: sys_waitid() will return 0 on success, either infop is set or not. Note, there's another bug in wait_noreap_copyout() which affects return value of sys_waitid(). It will be fixed in next patch.
Signed-off-by: Vitaly Mayatskikh Reviewed-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 1daa7f46bcc..2cc69eb8db2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1645,32 +1645,6 @@ notask: end: __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); - - if (wo->wo_info) { - struct siginfo __user *infop = wo->wo_info; - - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } return retval; } @@ -1715,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + + if (ret > 0) { + ret = 0; + } else if (infop) { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference.
+ */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + put_pid(pid); /* avoid REGPARM breakage on x86: */ -- cgit v1.2.3-70-g09d2 From b6fe2d117e98805ee76352e6468f87d494a97292 Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Wed, 23 Sep 2009 15:56:52 -0700 Subject: wait_noreap_copyout(): check for ->wo_info != NULL Current behaviour of sys_waitid() looks odd. If user passes infop == NULL, sys_waitid() returns success. When user additionally specifies flag WNOWAIT, sys_waitid() returns -EFAULT on the same conditions. When user combines WNOWAIT with WCONTINUED, sys_waitid() again returns success. This patch adds check for ->wo_info in wait_noreap_copyout(). User-visible change: starting from this commit, sys_waitid() always checks infop != NULL and does not fail if it is NULL. 
Signed-off-by: Vitaly Mayatskikh Reviewed-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel/exit.c') diff --git a/kernel/exit.c b/kernel/exit.c index 2cc69eb8db2..6c75ff83a8f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1140,18 +1140,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; -- cgit v1.2.3-70-g09d2 From 801460d0cf5c5288153b722565773059b0f44348 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 23 Sep 2009 15:57:41 -0700 Subject: task_struct cleanup: move binfmt field to mm_struct Because the binfmt is not different between threads in the same process, it can be moved from task_struct to mm_struct. And binfmt module is handled per mm_struct instead of task_struct.
Signed-off-by: Hiroshi Shimamoto Acked-by: Oleg Nesterov Cc: Rusty Russell Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 10 ++++++---- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 1 - kernel/exit.c | 2 -- kernel/fork.c | 13 +++++++------ 5 files changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel/exit.c') diff --git a/fs/exec.c b/fs/exec.c index 6dc92c39dd9..d49be6bc179 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1397,10 +1397,12 @@ out_ret: void set_binfmt(struct linux_binfmt *new) { - if (current->binfmt) - module_put(current->binfmt->module); + struct mm_struct *mm = current->mm; + + if (mm->binfmt) + module_put(mm->binfmt->module); - current->binfmt = new; + mm->binfmt = new; if (new) __module_get(new->module); } @@ -1770,7 +1772,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) audit_core_dumps(signr); - binfmt = current->binfmt; + binfmt = mm->binfmt; if (!binfmt || !binfmt->core_dump) goto fail; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b7029ab9c8..21d6aa45206 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,6 +240,8 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + struct linux_binfmt *binfmt; + cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 811cd96524d..8a16f6d11dc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1271,7 +1271,6 @@ struct task_struct { struct mm_struct *mm, *active_mm; /* task state */ - struct linux_binfmt *binfmt; int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ diff --git a/kernel/exit.c b/kernel/exit.c index 6c75ff83a8f..5859f598c95 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -976,8 +976,6 @@ NORET_TYPE void do_exit(long code) disassociate_ctty(1); 
module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); proc_exit_connector(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index e49f181ba1c..266c6af6ef1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -518,6 +518,8 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + if (mm->binfmt) + module_put(mm->binfmt->module); mmdrop(mm); } } @@ -643,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; + if (mm->binfmt && !try_module_get(mm->binfmt->module)) + goto free_pt; + return mm; free_pt: + /* don't put binfmt in mmput, we haven't got module yet */ + mm->binfmt = NULL; mmput(mm); fail_nomem: @@ -1037,9 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1327,9 +1331,6 @@ bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); -- cgit v1.2.3-70-g09d2