From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/fork.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf8..d428336e7aa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; + unsigned long *stackend; + int err; prepare_to_copy(orig); @@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto out; setup_thread_stack(tsk, orig); + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); -- cgit v1.2.3-70-g09d2 From af432eb1cc3178ec7109aca2283aafb1c12ccac1 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 16 Jan 2009 09:09:38 -0800 Subject: softlockup: fix to allow compiling with !DETECT_HUNG_TASK Fixes the following compile error: kernel/fork.c:1049: error: 'struct task_struct' has no member named 'last_switch_count' kernel/fork.c:1050: error: 'struct task_struct' has no member named 'last_switch_timestamp' Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd..fb944428283 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1041,7 +1041,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_DETECT_HUNG_TASK p->last_switch_count = 0; p->last_switch_timestamp = 0; #endif -- cgit v1.2.3-70-g09d2 From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:15 +0100 Subject: signal: re-add dead task accumulation stats. We're going to split the process wide cpu accounting into two parts: - clocks; which can take all the time they want since they run from user context. - timers; which need constant time tracing but can affort the overhead because they're default off -- and rare. The clock readout will go back to a full sum of the thread group, for this we need to re-add the exit stats that were removed in the initial itimer rework (f06febc9: timers: fix itimer/many thread hang). Furthermore, since that full sum can be rather slow for large thread groups and we have the complete dead task stats, revert the do_notify_parent time computation. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++++- kernel/exit.c | 3 +++ kernel/fork.c | 3 ++- kernel/signal.c | 8 ++++---- 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2127e959e0f..2e0646a3031 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -559,7 +559,7 @@ struct signal_struct { * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ - cputime_t cutime, cstime; + cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@ -567,6 +567,14 @@ struct signal_struct { unsigned long inblock, oublock, cinblock, coublock; struct task_io_accounting ioac; + /* + * Cumulative ns of schedule CPU time fo dead threads in the + * group, not including a zombie group leader, (This only differs + * from jiffies_to_ns(utime + stime) if sched_clock uses something + * other than jiffies.) + */ + unsigned long long sum_sched_runtime; + /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs diff --git a/kernel/exit.c b/kernel/exit.c index f80dec3f187..efd30ccf385 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } diff --git a/kernel/fork.c b/kernel/fork.c index 242a706e772..e8e854a04ad 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->cutime = sig->cstime = cputime_zero; + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); + sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); task_lock(current->group_leader); diff --git a/kernel/signal.c b/kernel/signal.c index b6b36768b75..2a74fe87c0d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; - struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - thread_group_cputime(tsk, &cputime); - info.si_utime = cputime_to_jiffies(cputime.utime); - info.si_stime = cputime_to_jiffies(cputime.stime); + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, + tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) -- cgit v1.2.3-70-g09d2 From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 6 Feb 2009 15:37:47 -0800 Subject: softlockup: remove timestamp checking from hung_task Impact: saves sizeof(long) bytes per task_struct By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between tasklist scans we can avoid using timestamps. Signed-off-by: Mandeep Singh Baines Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/fork.c | 8 +++----- kernel/hung_task.c | 48 +++++++++--------------------------------------- 3 files changed, 12 insertions(+), 45 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a2811c6239..e0d723fea9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1241,7 +1241,6 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ - unsigned long last_switch_timestamp; unsigned long last_switch_count; #endif /* CPU-specific state of this task */ diff --git a/kernel/fork.c b/kernel/fork.c index fb944428283..bf582f75014 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; +#ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +#endif tsk->mm = NULL; tsk->active_mm = NULL; @@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_HUNG_TASK - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3951a80e7cb..0c924de58cb 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; -static unsigned long __read_mostly hung_task_poll_jiffies; unsigned long __read_mostly sysctl_hung_task_warnings = 10; @@ -69,33 +68,17 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(void) -{ - int this_cpu = raw_smp_processor_id(); - - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void check_hung_task(struct task_struct *t, unsigned long now, - unsigned long timeout) +static void check_hung_task(struct task_struct *t, unsigned long timeout) { unsigned long switch_count = t->nvcsw + t->nivcsw; if (t->flags & PF_FROZEN) return; - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { + if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; - t->last_switch_timestamp = now; return; } - if ((long)(now - t->last_switch_timestamp) < timeout) - return; if (!sysctl_hung_task_warnings) return; sysctl_hung_task_warnings--; @@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now, sched_show_task(t); __debug_show_held_locks(t); - t->last_switch_timestamp = now; touch_nmi_watchdog(); if (sysctl_hung_task_panic) @@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; int batch_count = HUNG_TASK_BATCHING; - unsigned long now = get_timestamp(); struct task_struct *g, *t; /* @@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now, timeout); + check_hung_task(t, timeout); } while_each_thread(g, t); unlock: rcu_read_unlock(); } -static void update_poll_jiffies(void) +static unsigned long timeout_jiffies(unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - if (sysctl_hung_task_timeout_secs == 0) - hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT; - else - hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2; + return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; } /* @@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, if (ret || !write) goto out; - update_poll_jiffies(); - wake_up_process(watchdog_task); out: @@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, static int watchdog(void *dummy) { set_user_nice(current, 0); - update_poll_jiffies(); for ( ; ; ) { - unsigned long timeout; + unsigned long timeout = sysctl_hung_task_timeout_secs; - while (schedule_timeout_interruptible(hung_task_poll_jiffies)); + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; - /* - * Need to cache timeout here to avoid timeout being set - * to 0 via sysctl while inside check_hung_*_tasks(). - */ - timeout = sysctl_hung_task_timeout_secs; - if (timeout) - check_hung_uninterruptible_tasks(timeout); + check_hung_uninterruptible_tasks(timeout); } return 0; -- cgit v1.2.3-70-g09d2 From 06eb23b1ba39c61ee5d5faeb42a097635693e370 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 9 Feb 2009 02:02:33 +0100 Subject: ptrace, x86: fix the usage of ptrace_fork() I noticed by pure accident we have ptrace_fork() and friends. This was added by "x86, bts: add fork and exit handling", commit bf53de907dfdaac178c92d774aae7370d7b97d20. I can't test this, ds_request_bts() returns -EOPNOTSUPP, but I strongly believe this needs the fix. I think something like this program int main(void) { int pid = fork(); if (!pid) { ptrace(PTRACE_TRACEME, 0, NULL, NULL); kill(getpid(), SIGSTOP); fork(); } else { struct ptrace_bts_config bts = { .flags = PTRACE_BTS_O_ALLOC, .size = 4 * 4096, }; wait(NULL); ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACEFORK); ptrace(PTRACE_BTS_CONFIG, pid, &bts, sizeof(bts)); ptrace(PTRACE_CONT, pid, NULL, NULL); sleep(1); } return 0; } should crash the kernel. If the task is traced by its natural parent ptrace_reparented() returns 0 but we should clear ->btsxxx anyway. Signed-off-by: Oleg Nesterov Acked-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 242a706e772..43c039d55e9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1093,7 +1093,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(ptrace_reparented(current))) + if (unlikely(current->ptrace)) ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ -- cgit v1.2.3-70-g09d2 From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 2 Mar 2009 22:58:45 +0100 Subject: copy_process: fix CLONE_PARENT && parent_exec_id interaction CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we re-use the old parent, we must also re-use ->parent_exec_id to make sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed task exits. Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place two different cases together. Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Andrew Morton Cc: David Howells Cc: Serge E. Hallyn Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index a66fbde2071..4854c2c4a82 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif clear_all_latency_tracing(p); - /* Our parent execution domain becomes current domain - These must match for thread signalling to apply */ - p->parent_exec_id = p->self_exec_id; - /* ok, now we should be set up.. */ p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); p->pdeath_signal = 0; @@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, set_task_cpu(p, smp_processor_id()); /* CLONE_PARENT re-uses the old parent */ - if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; - else + p->parent_exec_id = current->parent_exec_id; + } else { p->real_parent = current; + p->parent_exec_id = current->self_exec_id; + } spin_lock(¤t->sighand->siglock); -- cgit v1.2.3-70-g09d2 From 9489424454c93f4d225d7af47978f8c7e84bf4d4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 Mar 2009 22:05:12 -0600 Subject: cpumask: use mm_cpumask() wrapper: kernel/fork.c Impact: futureproof Makes code futureproof against the impending change to mm->cpu_vm_mask. It's also a chance to use the new cpumask_ ops which take a pointer. Signed-off-by: Rusty Russell --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 6715ebc3761..47c15840a38 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -284,7 +284,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; - cpus_clear(mm->cpu_vm_mask); + cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; -- cgit v1.2.3-70-g09d2 From 3e93cd671813e204c258f1e6c797959920cf7772 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:00:13 -0400 Subject: Take fs_struct handling to new file (fs/fs_struct.c) Pure code move; two new helper functions for nfsd and daemonize (unshare_fs_struct() and daemonize_fs_struct() resp.; for now - the same code as used to be in callers). unshare_fs_struct() exported (for nfsd, as copy_fs_struct()/exit_fs() used to be), copy_fs_struct() and exit_fs() don't need exports anymore. Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/fs_struct.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ fs/internal.h | 6 ++ fs/namei.c | 7 --- fs/namespace.c | 68 ---------------------- fs/nfsd/nfssvc.c | 7 +-- include/linux/fs_struct.h | 2 + kernel/exit.c | 31 +--------- kernel/fork.c | 29 +--------- 9 files changed, 155 insertions(+), 138 deletions(-) create mode 100644 fs/fs_struct.c (limited to 'kernel/fork.c') diff --git a/fs/Makefile b/fs/Makefile index 6e82a307bcd..b5cd8e18dd9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ - stack.o + stack.o fs_struct.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o diff --git a/fs/fs_struct.c b/fs/fs_struct.c new file mode 100644 index 00000000000..36e0a123bbf --- /dev/null +++ b/fs/fs_struct.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include + +/* + * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_root(struct fs_struct *fs, struct path *path) +{ + struct path old_root; + + write_lock(&fs->lock); + old_root = fs->root; + fs->root = *path; + path_get(path); + write_unlock(&fs->lock); + if (old_root.dentry) + path_put(&old_root); +} + +/* + * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. + * It can block. + */ +void set_fs_pwd(struct fs_struct *fs, struct path *path) +{ + struct path old_pwd; + + write_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + write_unlock(&fs->lock); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +void chroot_fs_refs(struct path *old_root, struct path *new_root) +{ + struct task_struct *g, *p; + struct fs_struct *fs; + int count = 0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + fs = p->fs; + if (fs) { + write_lock(&fs->lock); + if (fs->root.dentry == old_root->dentry + && fs->root.mnt == old_root->mnt) { + path_get(new_root); + fs->root = *new_root; + count++; + } + if (fs->pwd.dentry == old_root->dentry + && fs->pwd.mnt == old_root->mnt) { + path_get(new_root); + fs->pwd = *new_root; + count++; + } + write_unlock(&fs->lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + while (count--) + path_put(old_root); +} + +void put_fs_struct(struct fs_struct *fs) +{ + /* No need to hold fs->lock if we are killing it */ + if (atomic_dec_and_test(&fs->count)) { + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); + } +} + +void exit_fs(struct task_struct *tsk) +{ + struct fs_struct * fs = tsk->fs; + + if (fs) { + task_lock(tsk); + tsk->fs = NULL; + task_unlock(tsk); + put_fs_struct(fs); + } +} + +struct fs_struct *copy_fs_struct(struct fs_struct *old) +{ + struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); + /* We don't need to lock fs - think why ;-) */ + if (fs) { + atomic_set(&fs->count, 1); + rwlock_init(&fs->lock); + fs->umask = old->umask; + read_lock(&old->lock); + fs->root = old->root; + path_get(&old->root); + fs->pwd = old->pwd; + path_get(&old->pwd); + read_unlock(&old->lock); + } + return fs; +} + +int unshare_fs_struct(void) +{ + struct fs_struct *fsp = copy_fs_struct(current->fs); + if (!fsp) + return -ENOMEM; + exit_fs(current); + current->fs = fsp; + return 0; +} +EXPORT_SYMBOL_GPL(unshare_fs_struct); + +/* to be mentioned only in INIT_TASK */ +struct fs_struct init_fs = { + .count = ATOMIC_INIT(1), + .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .umask = 0022, +}; + +void daemonize_fs_struct(void) +{ + struct fs_struct *fs; + + exit_fs(current); /* current->fs->count--; */ + fs = &init_fs; + current->fs = fs; + atomic_inc(&fs->count); +} diff --git a/fs/internal.h b/fs/internal.h index 53af885f173..477a105f8df 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct linux_binprm; +struct path; /* * block_dev.c @@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *); extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); + +/* + * fs_struct.c + */ +extern void chroot_fs_refs(struct path *, struct path *); diff --git a/fs/namei.c b/fs/namei.c index d040ce11785..4c65a646013 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2897,10 +2897,3 @@ EXPORT_SYMBOL(vfs_symlink); EXPORT_SYMBOL(vfs_unlink); EXPORT_SYMBOL(dentry_unhash); EXPORT_SYMBOL(generic_readlink); - -/* to be mentioned only in INIT_TASK */ -struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), - .umask = 0022, -}; diff --git a/fs/namespace.c b/fs/namespace.c index f7ec283ccfb..1e56303c718 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2092,74 +2092,6 @@ out1: return retval; } -/* - * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_root(struct fs_struct *fs, struct path *path) -{ - struct path old_root; - - write_lock(&fs->lock); - old_root = fs->root; - fs->root = *path; - path_get(path); - write_unlock(&fs->lock); - if (old_root.dentry) - path_put(&old_root); -} - -/* - * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. - * It can block. Requires the big lock held. - */ -void set_fs_pwd(struct fs_struct *fs, struct path *path) -{ - struct path old_pwd; - - write_lock(&fs->lock); - old_pwd = fs->pwd; - fs->pwd = *path; - path_get(path); - write_unlock(&fs->lock); - - if (old_pwd.dentry) - path_put(&old_pwd); -} - -static void chroot_fs_refs(struct path *old_root, struct path *new_root) -{ - struct task_struct *g, *p; - struct fs_struct *fs; - int count = 0; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - fs = p->fs; - if (fs) { - write_lock(&fs->lock); - if (fs->root.dentry == old_root->dentry - && fs->root.mnt == old_root->mnt) { - path_get(new_root); - fs->root = *new_root; - count++; - } - if (fs->pwd.dentry == old_root->dentry - && fs->pwd.mnt == old_root->mnt) { - path_get(new_root); - fs->pwd = *new_root; - count++; - } - write_unlock(&fs->lock); - } - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - while (count--) - path_put(old_root); -} - /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 07e4f5d7baa..144d6991861 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -404,7 +404,6 @@ static int nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; - struct fs_struct *fsp; int err, preverr = 0; /* Lock module and set up kernel thread */ @@ -413,13 +412,11 @@ nfsd(void *vrqstp) /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ - fsp = copy_fs_struct(current->fs); - if (!fsp) { + if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; } - exit_fs(current); - current->fs = fsp; + current->fs->umask = 0; /* diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 18b467dbe27..298cef1c079 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -20,5 +20,7 @@ extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void put_fs_struct(struct fs_struct *); +extern void daemonize_fs_struct(void); +extern int unshare_fs_struct(void); #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c..ad8375758a7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -429,7 +429,6 @@ EXPORT_SYMBOL(disallow_signal); void daemonize(const char *name, ...) { va_list args; - struct fs_struct *fs; sigset_t blocked; va_start(args, name); @@ -462,11 +461,7 @@ void daemonize(const char *name, ...) /* Become as one with the init task */ - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - + daemonize_fs_struct(); exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -565,30 +560,6 @@ void exit_files(struct task_struct *tsk) } } -void put_fs_struct(struct fs_struct *fs) -{ - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } -} - -void exit_fs(struct task_struct *tsk) -{ - struct fs_struct * fs = tsk->fs; - - if (fs) { - task_lock(tsk); - tsk->fs = NULL; - task_unlock(tsk); - put_fs_struct(fs); - } -} - -EXPORT_SYMBOL_GPL(exit_fs); - #ifdef CONFIG_MM_OWNER /* * Task p is exiting and it owned mm, lets find a new owner for it diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a38..05c02dc586b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -681,38 +681,13 @@ fail_nomem: return retval; } -static struct fs_struct *__copy_fs_struct(struct fs_struct *old) -{ - struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); - /* We don't need to lock fs - think why ;-) */ - if (fs) { - atomic_set(&fs->count, 1); - rwlock_init(&fs->lock); - fs->umask = old->umask; - read_lock(&old->lock); - fs->root = old->root; - path_get(&old->root); - fs->pwd = old->pwd; - path_get(&old->pwd); - read_unlock(&old->lock); - } - return fs; -} - -struct fs_struct *copy_fs_struct(struct fs_struct *old) -{ - return __copy_fs_struct(old); -} - -EXPORT_SYMBOL_GPL(copy_fs_struct); - static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { if (clone_flags & CLONE_FS) { atomic_inc(¤t->fs->count); return 0; } - tsk->fs = __copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(current->fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1545,7 +1520,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) if ((unshare_flags & CLONE_FS) && (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = __copy_fs_struct(current->fs); + *new_fsp = copy_fs_struct(current->fs); if (!*new_fsp) return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 498052bba55ecaff58db6a1436b0e25bfd75a7ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 Mar 2009 07:20:30 -0400 Subject: New locking/refcounting for fs_struct * all changes of current->fs are done under task_lock and write_lock of old fs->lock * refcount is not atomic anymore (same protection) * its decrements are done when removing reference from current; at the same time we decide whether to free it. * put_fs_struct() is gone * new field - ->in_exec. Set by check_unsafe_exec() if we are trying to do execve() and only subthreads share fs_struct. Cleared when finishing exec (success and failure alike). Makes CLONE_FS fail with -EAGAIN if set. * check_unsafe_exec() may fail with -EAGAIN if another execve() from subthread is in progress. Signed-off-by: Al Viro --- fs/compat.c | 16 +++++++++-- fs/exec.c | 31 +++++++++++++++++---- fs/fs_struct.c | 69 +++++++++++++++++++++++++++++++++-------------- fs/internal.h | 2 +- fs/proc/task_nommu.c | 2 +- include/linux/fs_struct.h | 8 +++--- kernel/fork.c | 37 ++++++++++++++++++------- 7 files changed, 121 insertions(+), 44 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/compat.c b/fs/compat.c index 55efdfebdf5..baabf203b84 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -1441,12 +1442,15 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1488,6 +1492,9 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1506,6 +1513,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/exec.c b/fs/exec.c index c5128fbc916..07a059664b7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1056,16 +1056,18 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned long flags; unsigned n_fs, n_sighand; + int res = 0; bprm->unsafe = tracehook_unsafe_exec(p); n_fs = 1; n_sighand = 1; + write_lock(&p->fs->lock); lock_task_sighand(p, &flags); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) @@ -1073,11 +1075,19 @@ void check_unsafe_exec(struct linux_binprm *bprm) n_sighand++; } - if (atomic_read(&p->fs->count) > n_fs || - atomic_read(&p->sighand->count) > n_sighand) + if (p->fs->users > n_fs || + atomic_read(&p->sighand->count) > n_sighand) { bprm->unsafe |= LSM_UNSAFE_SHARE; + } else { + if (p->fs->in_exec) + res = -EAGAIN; + p->fs->in_exec = 1; + } unlock_task_sighand(p, &flags); + write_unlock(&p->fs->lock); + + return res; } /* @@ -1296,12 +1306,15 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + + retval = check_unsafe_exec(bprm); + if (retval) + goto out_unlock; file = open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) - goto out_unlock; + goto out_unmark; sched_exec(); @@ -1344,6 +1357,9 @@ int do_execve(char * filename, goto out; /* execve succeeded */ + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1362,6 +1378,11 @@ out_file: fput(bprm->file); } +out_unmark: + write_lock(¤t->fs->lock); + current->fs->in_exec = 0; + write_unlock(¤t->fs->lock); + out_unlock: current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 36e0a123bbf..41cff72b377 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -72,25 +72,27 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) path_put(old_root); } -void put_fs_struct(struct fs_struct *fs) +void free_fs_struct(struct fs_struct *fs) { - /* No need to hold fs->lock if we are killing it */ - if (atomic_dec_and_test(&fs->count)) { - path_put(&fs->root); - path_put(&fs->pwd); - kmem_cache_free(fs_cachep, fs); - } + path_put(&fs->root); + path_put(&fs->pwd); + kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { - struct fs_struct * fs = tsk->fs; + struct fs_struct *fs = tsk->fs; if (fs) { + int kill; task_lock(tsk); + write_lock(&fs->lock); tsk->fs = NULL; + kill = !--fs->users; + write_unlock(&fs->lock); task_unlock(tsk); - put_fs_struct(fs); + if (kill) + free_fs_struct(fs); } } @@ -99,7 +101,8 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { - atomic_set(&fs->count, 1); + fs->users = 1; + fs->in_exec = 0; rwlock_init(&fs->lock); fs->umask = old->umask; read_lock(&old->lock); @@ -114,28 +117,54 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) int unshare_fs_struct(void) { - struct fs_struct *fsp = copy_fs_struct(current->fs); - if (!fsp) + struct fs_struct *fs = current->fs; + struct fs_struct *new_fs = copy_fs_struct(fs); + int kill; + + if (!new_fs) return -ENOMEM; - exit_fs(current); - current->fs = fsp; + + task_lock(current); + write_lock(&fs->lock); + kill = !--fs->users; + current->fs = new_fs; + write_unlock(&fs->lock); + task_unlock(current); + + if (kill) + free_fs_struct(fs); + return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { - .count = ATOMIC_INIT(1), + .users = 1, .lock = __RW_LOCK_UNLOCKED(init_fs.lock), .umask = 0022, }; void daemonize_fs_struct(void) { - struct fs_struct *fs; + struct fs_struct *fs = current->fs; + + if (fs) { + int kill; + + task_lock(current); - exit_fs(current); /* current->fs->count--; */ - fs = &init_fs; - current->fs = fs; - atomic_inc(&fs->count); + write_lock(&init_fs.lock); + init_fs.users++; + write_unlock(&init_fs.lock); + + write_lock(&fs->lock); + current->fs = &init_fs; + kill = !--fs->users; + write_unlock(&fs->lock); + + task_unlock(current); + if (kill) + free_fs_struct(fs); + } } diff --git a/fs/internal.h b/fs/internal.h index 477a105f8df..b4dac4fb6b6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -44,7 +44,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern int check_unsafe_exec(struct linux_binprm *); /* * namespace.c diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc..6ca01052c5b 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -49,7 +49,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) else bytes += kobjsize(mm); - if (current->fs && atomic_read(¤t->fs->count) > 1) + if (current->fs && current->fs->users > 1) sbytes += kobjsize(current->fs); else bytes += kobjsize(current->fs); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 298cef1c079..78a05bfcd8e 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -4,12 +4,10 @@ #include struct fs_struct { - atomic_t count; /* This usage count is used by check_unsafe_exec() for - * security checking purposes - therefore it may not be - * incremented, except by clone(CLONE_FS). - */ + int users; rwlock_t lock; int umask; + int in_exec; struct path root, pwd; }; @@ -19,7 +17,7 @@ extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, struct path *); extern void set_fs_pwd(struct fs_struct *, struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); -extern void put_fs_struct(struct fs_struct *); +extern void free_fs_struct(struct fs_struct *); extern void daemonize_fs_struct(void); extern int unshare_fs_struct(void); diff --git a/kernel/fork.c b/kernel/fork.c index 05c02dc586b..51f138a131d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -683,11 +683,19 @@ fail_nomem: static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { + struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { - atomic_inc(¤t->fs->count); + /* tsk->fs is already what we want */ + write_lock(&fs->lock); + if (fs->in_exec) { + write_unlock(&fs->lock); + return -EAGAIN; + } + fs->users++; + write_unlock(&fs->lock); return 0; } - tsk->fs = copy_fs_struct(current->fs); + tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; @@ -1518,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; - if ((unshare_flags & CLONE_FS) && - (fs && atomic_read(&fs->count) > 1)) { - *new_fsp = copy_fs_struct(current->fs); - if (!*new_fsp) - return -ENOMEM; - } + if (!(unshare_flags & CLONE_FS) || !fs) + return 0; + + /* don't need lock here; in the worst case we'll do useless copy */ + if (fs->users == 1) + return 0; + + *new_fsp = copy_fs_struct(fs); + if (!*new_fsp) + return -ENOMEM; return 0; } @@ -1639,8 +1651,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (new_fs) { fs = current->fs; + write_lock(&fs->lock); current->fs = new_fs; - new_fs = fs; + if (--fs->users) + new_fs = NULL; + else + new_fs = fs; + write_unlock(&fs->lock); } if (new_mm) { @@ -1679,7 +1696,7 @@ bad_unshare_cleanup_sigh: bad_unshare_cleanup_fs: if (new_fs) - put_fs_struct(new_fs); + free_fs_struct(new_fs); bad_unshare_cleanup_thread: bad_unshare_out: -- cgit v1.2.3-70-g09d2 From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:50:06 -0400 Subject: Get rid of indirect include of fs_struct.h Don't pull it in sched.h; very few files actually need it and those can include directly. sched.h itself only needs forward declaration of struct fs_struct; Signed-off-by: Al Viro --- arch/cris/kernel/process.c | 1 - fs/dcache.c | 1 + fs/exec.c | 1 + fs/fs_struct.c | 1 + fs/namei.c | 1 + fs/namespace.c | 1 + fs/open.c | 1 + fs/proc/base.c | 1 + fs/proc/task_nommu.c | 1 + include/linux/mnt_namespace.h | 2 ++ include/linux/nsproxy.h | 1 + include/linux/sched.h | 3 ++- init/do_mounts.c | 1 + kernel/auditsc.c | 1 + kernel/exec_domain.c | 1 + kernel/exit.c | 1 + kernel/fork.c | 1 + kernel/sys.c | 1 + security/tomoyo/realpath.c | 1 + 19 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 60816e87645..4df0b320d52 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/dcache.c b/fs/dcache.c index 90bbd7e1b11..0dc4de21f08 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/exec.c b/fs/exec.c index 614991bf0c8..052a961e41a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 6ac21933867..eee059052db 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -3,6 +3,7 @@ #include #include #include +#include /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. diff --git a/fs/namei.c b/fs/namei.c index 964c0249444..b8433ebfae0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) diff --git a/fs/namespace.c b/fs/namespace.c index 1e56303c718..c6f54e4c429 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "pnode.h" diff --git a/fs/open.c b/fs/open.c index 75b61677daa..377eb25b6ab 100644 --- a/fs/open.c +++ b/fs/open.c @@ -29,6 +29,7 @@ #include #include #include +#include int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { diff --git a/fs/proc/base.c b/fs/proc/base.c index e0afd326b68..f71559784bf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "internal.h" /* NOTE: diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 6ca01052c5b..253afc04484 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 830bbcd449d..3a059298cc1 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -22,6 +22,8 @@ struct proc_mounts { int event; }; +struct fs_struct; + extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void __put_mnt_ns(struct mnt_namespace *ns); diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index afad7dec1b3..7b370c7cfef 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -8,6 +8,7 @@ struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; +struct fs_struct; /* * A structure to contain pointers to all per-process diff --git a/include/linux/sched.h b/include/linux/sched.h index 29df6374d2d..b4e065ea0de 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -68,7 +68,7 @@ struct sched_param { #include #include #include -#include +#include #include #include #include @@ -97,6 +97,7 @@ struct futex_pi_state; struct robust_list_head; struct bio; struct bts_tracer; +struct fs_struct; /* * List of flags we want to share for kernel threads, diff --git a/init/do_mounts.c b/init/do_mounts.c index 8d4ff5afc1d..dd7ee5f203f 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8cbddff6c28..2bfc6478676 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "audit.h" diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index cb8e9626c21..c35452cadde 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -18,6 +18,7 @@ #include #include #include +#include static void default_handler(int, struct pt_regs *); diff --git a/kernel/exit.c b/kernel/exit.c index ad8375758a7..b5d656845c9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 51f138a131d..e82a14577a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882..ce182aaed20 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index d47f16b844b..3bbe01a7a4b 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "common.h" #include "realpath.h" -- cgit v1.2.3-70-g09d2 From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 2 Apr 2009 16:56:32 -0700 Subject: nommu: fix a number of issues with the per-MM VMA patch Fix a number of issues with the per-MM VMA patch: (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on a NOMMU system with more than 2G pages. Makes no difference on a 32-bit system. (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value, lest it overflow. (3) Move the allocation of the vm_area_struct slab back for fork.c. (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs. (5) Use BUG_ON() rather than if () BUG(). (6) Make the default validate_nommu_regions() a static inline rather than a #define. (7) Make free_page_series()'s objection to pages with a refcount != 1 more informative. (8) Adjust the __put_nommu_region() banner comment to indicate that the semaphore must be held for writing. (9) Limit the number of warnings about munmaps of non-mmapped regions. Reported-by: Andrew Morton Signed-off-by: David Howells Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- fs/proc/task_nommu.c | 4 ++-- include/linux/mm.h | 2 +- kernel/fork.c | 1 + mm/mmap.c | 3 --- mm/nommu.c | 52 +++++++++++++++++++++++++--------------------------- 6 files changed, 30 insertions(+), 34 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 43d23948384..74ea974f5ca 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeram-i.freehigh), #endif #ifndef CONFIG_MMU - K((unsigned long) atomic_read(&mmap_pages_allocated)), + K((unsigned long) atomic_long_read(&mmap_pages_allocated)), #endif K(i.totalswap), K(i.freeswap), diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 343ea1216bc..370be0a2c90 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -136,14 +136,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } seq_printf(m, - "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", vma->vm_start, vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - vma->vm_pgoff << PAGE_SHIFT, + (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/include/linux/mm.h b/include/linux/mm.h index aeabe953ba4..bff1f0d475c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,7 +1079,7 @@ static inline void setup_per_cpu_pageset(void) {} #endif /* nommu.c */ -extern atomic_t mmap_pages_allocated; +extern atomic_long_t mmap_pages_allocated; /* prio_tree.c */ void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); diff --git a/kernel/fork.c b/kernel/fork.c index 47c15840a38..51d1aa21483 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1488,6 +1488,7 @@ void __init proc_caches_init(void) mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); } diff --git a/mm/mmap.c b/mm/mmap.c index 1abb9185a68..4a3841186c1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); } diff --git a/mm/nommu.c b/mm/nommu.c index 2fcf47d449b..72eda4aee2c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ int heap_stack_gap = 0; -atomic_t mmap_pages_allocated; +atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(num_physpages); @@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { - vm_region_jar = kmem_cache_create("vm_region_jar", - sizeof(struct vm_region), 0, - SLAB_PANIC, NULL); - vm_area_cachep = kmem_cache_create("vm_area_struct", - sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } /* @@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(last->vm_end <= last->vm_start)) - BUG(); - if (unlikely(last->vm_top < last->vm_end)) - BUG(); + BUG_ON(unlikely(last->vm_end <= last->vm_start)); + BUG_ON(unlikely(last->vm_top < last->vm_end)); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - if (unlikely(region->vm_end <= region->vm_start)) - BUG(); - if (unlikely(region->vm_top < region->vm_end)) - BUG(); - if (unlikely(region->vm_start < last->vm_top)) - BUG(); + BUG_ON(unlikely(region->vm_end <= region->vm_start)); + BUG_ON(unlikely(region->vm_top < region->vm_end)); + BUG_ON(unlikely(region->vm_start < last->vm_top)); lastp = p; } } #else -#define validate_nommu_regions() do {} while(0) +static void validate_nommu_regions(void) +{ +} #endif /* @@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to) struct page *page = virt_to_page(from); kdebug("- free %lx", from); - atomic_dec(&mmap_pages_allocated); + atomic_long_dec(&mmap_pages_allocated); if (page_count(page) != 1) - kdebug("free page %p [%d]", page, page_count(page)); + kdebug("free page %p: refcount not one: %d", + page, page_count(page)); put_page(page); } } /* * release a reference to a region - * - the caller must hold the region semaphore, which this releases + * - the caller must hold the region semaphore for writing, which this releases * - the region may not have been added to the tree yet, in which case vm_top * will equal vm_start */ @@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma, goto enomem; total = 1 << order; - atomic_add(total, &mmap_pages_allocated); + atomic_long_add(total, &mmap_pages_allocated); point = rlen >> PAGE_SHIFT; @@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma, order = ilog2(total - point); n = 1 << order; kdebug("shave %lu/%lu @%lu", n, total - point, total); - atomic_sub(n, &mmap_pages_allocated); + atomic_long_sub(n, &mmap_pages_allocated); total -= n; set_page_refcounted(pages + total); __free_pages(pages + total, order); @@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d (%s):" - " 0x%lx-0x%lx\n", - current->pid, current->comm, start, start + len - 1); + static int limit = 0; + if (limit < 5) { + printk(KERN_WARNING + "munmap of memory not mmapped by process %d" + " (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 2 Apr 2009 16:56:59 -0700 Subject: Simplify copy_thread() First argument unused since 2.3.11. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/avr32/kernel/process.c | 2 +- arch/blackfin/kernel/process.c | 2 +- arch/cris/arch-v10/kernel/process.c | 2 +- arch/cris/arch-v32/kernel/process.c | 2 +- arch/frv/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/m32r/kernel/process.c | 2 +- arch/m68k/kernel/process.c | 2 +- arch/m68knommu/kernel/process.c | 2 +- arch/mips/kernel/process.c | 2 +- arch/mn10300/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/s390/kernel/process.c | 2 +- arch/sh/kernel/process_32.c | 2 +- arch/sh/kernel/process_64.c | 2 +- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- arch/um/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/xtensa/kernel/process.c | 2 +- include/linux/sched.h | 3 ++- kernel/fork.c | 2 +- 27 files changed, 28 insertions(+), 27 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 8d0097f1020..3a2fb7a02db 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -272,7 +272,7 @@ alpha_vfork(struct pt_regs *regs) */ int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 2de14e2afdc..c3265a2e7cd 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -301,7 +301,7 @@ void release_thread(struct task_struct *dead_task) asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int -copy_thread(int nr, unsigned long clone_flags, unsigned long stack_start, +copy_thread(unsigned long clone_flags, unsigned long stack_start, unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs) { struct thread_info *thread = task_thread_info(p); diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 43ae555ecb3..1bbe1da5486 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -332,7 +332,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 33e2e8993f7..f49427293ca 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -193,7 +193,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index bd9b3ff63f6..c4c69cf721e 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -115,7 +115,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) */ asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index ced5b725d9b..120e7f796fe 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -131,7 +131,7 @@ kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) extern asmlinkage void ret_from_fork(void); int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 9583a338e9d..0de50df7497 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -204,7 +204,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * set up the kernel stack and exception frames for a new process */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index a8ef654a5a0..e2f33d0f996 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -191,7 +191,7 @@ asmlinkage int h8300_clone(struct pt_regs *regs) } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index c5716270514..5d7c0e5b9e7 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -413,7 +413,7 @@ ia64_load_extra (struct task_struct *task) * so there is nothing to worry about. */ int -copy_thread (int nr, unsigned long clone_flags, +copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 7103d91e1a2..3e876f0baeb 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) return 0; /* Task didn't use the fpu at all. */ } -int copy_thread(int nr, unsigned long clone_flags, unsigned long spu, +int copy_thread(unsigned long clone_flags, unsigned long spu, unsigned long unused, struct task_struct *tsk, struct pt_regs *regs) { struct pt_regs *childregs = task_pt_regs(tsk); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 632ce016014..ec37fb56c12 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -233,7 +233,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) parent_tidptr, child_tidptr); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 3f2d7745f31..1e96c6eb631 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -199,7 +199,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, NULL, NULL); } -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, struct task_struct * p, struct pt_regs * regs) { diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index ca2e4026ad2..1eaaa450e20 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -99,7 +99,7 @@ void flush_thread(void) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { struct thread_info *ti = task_thread_info(p); diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index b28c9a60445..234cf344cdc 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -193,7 +193,7 @@ void prepare_to_copy(struct task_struct *tsk) * set up the kernel stack for a new thread and copy arch-specific thread * control information */ -int copy_thread(int nr, unsigned long clone_flags, +int copy_thread(unsigned long clone_flags, unsigned long c_usp, unsigned long ustk_size, struct task_struct *p, struct pt_regs *kregs) { diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index b80e02a4d81..8aa591ed912 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -263,7 +263,7 @@ sys_vfork(struct pt_regs *regs) } int -copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, /* in ia64 this is "user_stack_size" */ struct task_struct * p, struct pt_regs * pregs) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index eac06494878..7b44a33f03c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -598,7 +598,7 @@ void prepare_to_copy(struct task_struct *tsk) /* * Copy a thread.. */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index b48e961a38f..a3acd8e60af 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -160,7 +160,7 @@ void release_thread(struct task_struct *dead_task) { } -int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp, +int copy_thread(unsigned long clone_flags, unsigned long new_stackp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index ddafbbbab2a..694bc15f84f 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -170,7 +170,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index c90c7e5e5fe..96be839040f 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -425,7 +425,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) asmlinkage void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f4bee35a1b4..2830b415e21 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -455,7 +455,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, */ extern void ret_from_fork(void); -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index a73954b87f0..4041f94e772 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -561,7 +561,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, * Parent --> %o0 == childs pid, %o1 == 0 * Child --> %o0 == parents pid, %o1 == 1 */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index a1c6d07cac3..4a28a1568d8 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -179,7 +179,7 @@ void fork_handler(void) userspace(¤t->thread.regs.regs); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long stack_top, struct task_struct * p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766ca..76f8f84043a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c..b751a41392b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -278,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct *p, struct pt_regs *regs) { diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9185597eb6a..031f3668571 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -172,7 +172,7 @@ void prepare_to_copy(struct task_struct *tsk) * childregs. */ -int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, +int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 481fad3a9b4..9186f8c5d5f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *); /* Allocate a new mm structure and copy contents from tsk->mm */ extern struct mm_struct *dup_mm(struct task_struct *tsk); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(unsigned long, unsigned long, unsigned long, + struct task_struct *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 51d1aa21483..d7eb727eb53 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1125,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_mm; if ((retval = copy_io(clone_flags, p))) goto bad_fork_cleanup_namespaces; - retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3-70-g09d2 From b3bfa0cba867f23365b81658b47efd906830879b Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Thu, 2 Apr 2009 16:58:08 -0700 Subject: signals: protect cinit from blocked fatal signals Normally SIG_DFL signals to global and container-init are dropped early. But if a signal is blocked when it is posted, we cannot drop the signal since the receiver may install a handler before unblocking the signal. Once this signal is queued however, the receiver container-init has no way of knowing if the signal was sent from an ancestor or descendant namespace. This patch ensures that contianer-init drops all SIG_DFL signals in get_signal_to_deliver() except SIGKILL/SIGSTOP. If SIGSTOP/SIGKILL originate from a descendant of container-init they are never queued (i.e dropped in sig_ignored() in an earler patch). If SIGSTOP/SIGKILL originate from parent namespace, the signal is queued and container-init processes the signal. IOW, if get_signal_to_deliver() sees a sig_kernel_only() signal for global or container-init, the signal must have been generated internally or must have come from an ancestor ns and we process the signal. Further, the signal_group_exit() check was needed to cover the case of a multi-threaded init sending SIGKILL to other threads when doing an exit() or exec(). But since the new sig_kernel_only() check covers the SIGKILL, the signal_group_exit() check is no longer needed and can be removed. Finally, now that we have all pieces in place, set SIGNAL_UNKILLABLE for container-inits. Signed-off-by: Sukadev Bhattiprolu Cc: Oleg Nesterov Cc: Roland McGrath Cc: "Eric W. Biederman" Cc: Daniel Lezcano Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ kernel/signal.c | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index d7eb727eb53..adbea16ec64 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -841,6 +841,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); sig->flags = 0; + if (clone_flags & CLONE_NEWPID) + sig->flags |= SIGNAL_UNKILLABLE; sig->group_exit_code = 0; sig->group_exit_task = NULL; sig->group_stop_count = 0; diff --git a/kernel/signal.c b/kernel/signal.c index fb19aae2363..ba3da25f0ee 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1870,9 +1870,16 @@ relock: /* * Global init gets no signals it doesn't want. + * Container-init gets no signals it doesn't want from same + * container. + * + * Note that if global/container-init sees a sig_kernel_only() + * signal here, the signal must have been generated internally + * or must have come from an ancestor namespace. In either + * case, the signal cannot be dropped. */ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && - !signal_group_exit(signal)) + !sig_kernel_only(signr)) continue; if (sig_kernel_stop(signr)) { -- cgit v1.2.3-70-g09d2 From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 2 Apr 2009 16:58:39 -0700 Subject: pids: kill signal_struct-> __pgrp/__session and friends We are wasting 2 words in signal_struct without any reason to implement task_pgrp_nr() and task_session_nr(). task_session_nr() has no callers since 2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it. task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and fs/coda. This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills __pgrp/__session and the related helpers. The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense anyway. Signed-off-by: Oleg Nesterov Acked-by: Alan Cox [tty parts] Cc: Cedric Le Goater Cc: Dave Hansen Cc: Eric Biederman Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sukadev Bhattiprolu Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tty_io.c | 4 ++-- include/linux/sched.h | 43 ++++++------------------------------------- kernel/exit.c | 10 +++------- kernel/fork.c | 2 -- kernel/sys.c | 4 +--- 5 files changed, 12 insertions(+), 51 deletions(-) (limited to 'kernel/fork.c') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index a44b701c5bb..66b99a2049e 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2681,7 +2681,7 @@ void __do_SAK(struct tty_struct *tty) /* Kill the entire session */ do_each_pid_task(session, PIDTYPE_SID, p) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); } while_each_pid_task(session, PIDTYPE_SID, p); @@ -2691,7 +2691,7 @@ void __do_SAK(struct tty_struct *tty) do_each_thread(g, p) { if (p->signal->tty == tty) { printk(KERN_NOTICE "SAK: killed process %d" - " (%s): task_session_nr(p)==tty->session\n", + " (%s): task_session(p)==tty->session\n", task_pid_nr(p), p->comm); send_sig(SIGKILL, p, 1); continue; diff --git a/include/linux/sched.h b/include/linux/sched.h index 49df878a0ca..206ac003e8c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,25 +547,8 @@ struct signal_struct { struct list_head cpu_timers[3]; - /* job control IDs */ - - /* - * pgrp and session fields are deprecated. - * use the task_session_Xnr and task_pgrp_Xnr routines below - */ - - union { - pid_t pgrp __deprecated; - pid_t __pgrp; - }; - struct pid *tty_old_pgrp; - union { - pid_t session __deprecated; - pid_t __session; - }; - /* boolean value for session group leader */ int leader; @@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p) return rt_prio(p->prio); } -static inline void set_task_session(struct task_struct *tsk, pid_t session) -{ - tsk->signal->__session = session; -} - -static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp) -{ - tsk->signal->__pgrp = pgrp; -} - static inline struct pid *task_pid(struct task_struct *task) { return task->pids[PIDTYPE_PID].pid; @@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk) } -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return tsk->signal->__pgrp; -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk) } -static inline pid_t task_session_nr(struct task_struct *tsk) -{ - return tsk->signal->__session; -} - static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +/* obsolete, do not use */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. diff --git a/kernel/exit.c b/kernel/exit.c index 384f09caf2e..3bec141c82f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -357,16 +357,12 @@ static void reparent_to_kthreadd(void) void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - pid_t nr = pid_nr(pid); - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - set_task_session(curr, nr); - } - if (task_pgrp(curr) != pid) { + + if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); - set_task_pgrp(curr, nr); - } } static void set_special_pids(struct pid *pid) diff --git a/kernel/fork.c b/kernel/fork.c index adbea16ec64..f7445823144 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1265,8 +1265,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->signal->leader_pid = pid; tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); - set_task_pgrp(p, task_pgrp_nr(current)); - set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); list_add_tail_rcu(&p->tasks, &init_task.tasks); diff --git a/kernel/sys.c b/kernel/sys.c index 37f458e6882..742cefa527e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) if (err) goto out; - if (task_pgrp(p) != pgrp) { + if (task_pgrp(p) != pgrp) change_pid(p, PIDTYPE_PGID, pgrp); - set_task_pgrp(p, pid_nr(pgrp)); - } err = 0; out: -- cgit v1.2.3-70-g09d2