From 5edee61edeaaebafe584f8fb7074c1ef4658596b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup: cgroup_subsys->fork() should be called after the task is added to css_set cgroup core has a bug which violates a basic rule about event notifications - when a new entity needs to be added, you add that to the notification list first and then make the new entity conform to the current state. If done in the reverse order, an event happening inbetween will be lost. cgroup_subsys->fork() is invoked way before the new task is added to the css_set. Currently, cgroup_freezer is the only user of ->fork() and uses it to make new tasks conform to the current state of the freezer. If FROZEN state is requested while fork is in progress between cgroup_fork_callbacks() and cgroup_post_fork(), the child could escape freezing - the cgroup isn't frozen when ->fork() is called and the freezer couldn't see the new task on the css_set. This patch moves cgroup_subsys->fork() invocation to cgroup_post_fork() after the new task is added to the css_set. cgroup_fork_callbacks() is removed. Because now a task may be migrated during cgroup_subsys->fork(), freezer_fork() is updated so that it adheres to the usual RCU locking and the rather pointless comment on why locking can be different there is removed (if it doesn't make anything simpler, why even bother?). Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/fork.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..acc4cb62f32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1135,7 +1135,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1393,12 +1392,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - /* Now that the task is set up, run cgroup callbacks if - * necessary. We need to run them before the task is visible - * on the tasklist. */ - cgroup_fork_callbacks(p); - cgroup_callbacks_done = 1; - /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1503,7 +1496,7 @@ bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, cgroup_callbacks_done); + cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: -- cgit v1.2.3-70-g09d2 From 32cdba1e05418909708a17e52505e8b2ba4381d1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 14 Nov 2012 19:03:42 +0100 Subject: uprobes: Use percpu_rw_semaphore to fix register/unregister vs dup_mmap() race This was always racy, but 268720903f87e0b84b161626c4447b81671b5d18 "uprobes: Rework register_for_each_vma() to make it O(n)" should be blamed anyway, it made everything worse and I didn't notice. register/unregister call build_map_info() and then do install/remove breakpoint for every mm which mmaps inode/offset. This can obviously race with fork()->dup_mmap() in between and we can miss the child. uprobe_register() could be easily fixed but unregister is much worse, the new mm inherits "int3" from parent and there is no way to detect this if uprobe goes away. So this patch simply adds percpu_down_read/up_read around dup_mmap(), and percpu_down_write/up_write into register_for_each_vma(). This adds 2 new hooks into dup_mmap() but we can kill uprobe_dup_mmap() and fold it into uprobe_end_dup_mmap(). Reported-by: Srikar Dronamraju Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 8 ++++++++ kernel/events/uprobes.c | 26 +++++++++++++++++++++++--- kernel/fork.c | 2 ++ 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 2615c4d7788..4f628a6fc5b 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -97,6 +97,8 @@ extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_con extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern void uprobe_start_dup_mmap(void); +extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); extern void uprobe_copy_process(struct task_struct *t); @@ -127,6 +129,12 @@ static inline void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { } +static inline void uprobe_start_dup_mmap(void) +{ +} +static inline void uprobe_end_dup_mmap(void) +{ +} static inline void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5ce99cfd2e6..dea7acfbb07 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -33,6 +33,7 @@ #include /* user_enable_single_step */ #include /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ +#include #include @@ -71,6 +72,8 @@ static struct mutex uprobes_mutex[UPROBES_HASH_SZ]; static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) +static struct percpu_rw_semaphore dup_mmap_sem; + /* * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe * events active at this time. Probably a fine grained per inode count is @@ -766,10 +769,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) struct map_info *info; int err = 0; + percpu_down_write(&dup_mmap_sem); info = build_map_info(uprobe->inode->i_mapping, uprobe->offset, is_register); - if (IS_ERR(info)) - return PTR_ERR(info); + if (IS_ERR(info)) { + err = PTR_ERR(info); + goto out; + } while (info) { struct mm_struct *mm = info->mm; @@ -799,7 +805,8 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) mmput(mm); info = free_map_info(info); } - + out: + percpu_up_write(&dup_mmap_sem); return err; } @@ -1131,6 +1138,16 @@ void uprobe_clear_state(struct mm_struct *mm) kfree(area); } +void uprobe_start_dup_mmap(void) +{ + percpu_down_read(&dup_mmap_sem); +} + +void uprobe_end_dup_mmap(void) +{ + percpu_up_read(&dup_mmap_sem); +} + void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { newmm->uprobes_state.xol_area = NULL; @@ -1597,6 +1614,9 @@ static int __init init_uprobes(void) mutex_init(&uprobes_mmap_mutex[i]); } + if (percpu_init_rwsem(&dup_mmap_sem)) + return -ENOMEM; + return register_die_notifier(&uprobe_exception_nb); } module_init(init_uprobes); diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..c497e57aa65 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -352,6 +352,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) unsigned long charge; struct mempolicy *pol; + uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); @@ -469,6 +470,7 @@ out: up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); + uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: mpol_put(pol); -- cgit v1.2.3-70-g09d2 From 17cf22c33e1f1b5e435469c84e43872579497653 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 14:51:53 -0800 Subject: pidns: Use task_active_pid_ns where appropriate The expressions tsk->nsproxy->pid_ns and task_active_pid_ns aka ns_of_pid(task_pid(tsk)) should have the same number of cache line misses with the practical difference that ns_of_pid(task_pid(tsk)) is released later in a processes life. Furthermore by using task_active_pid_ns it becomes trivial to write an unshare implementation for the the pid namespace. So I have used task_active_pid_ns everywhere I can. In fork since the pid has not yet been attached to the process I use ns_of_pid, to achieve the same effect. Signed-off-by: Eric W. Biederman --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 +- drivers/staging/android/binder.c | 3 ++- fs/hppfs/hppfs.c | 2 +- fs/proc/root.c | 2 +- kernel/cgroup.c | 2 +- kernel/events/core.c | 2 +- kernel/fork.c | 2 +- kernel/nsproxy.c | 2 +- kernel/pid.c | 8 ++++---- kernel/signal.c | 2 +- kernel/sysctl_binary.c | 2 +- 12 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 965d381abd7..25db92a8e1c 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -1094,7 +1094,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private) LOAD_INT(c), LOAD_FRAC(c), count_active_contexts(), atomic_read(&nr_spu_contexts), - current->nsproxy->pid_ns->last_pid); + task_active_pid_ns(current)->last_pid); return 0; } diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 79ccfe6c707..7fc71c62826 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -123,7 +123,7 @@ void mconsole_log(struct mc_request *req) void mconsole_proc(struct mc_request *req) { - struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt; + struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; char *buf; int len; struct file *file; diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 5d4610babd8..a97bbcd1c9e 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "binder.h" @@ -2344,7 +2345,7 @@ retry: if (t->from) { struct task_struct *sender = t->from->proc->tsk; tr.sender_pid = task_tgid_nr_ns(sender, - current->nsproxy->pid_ns); + task_active_pid_ns(current)); } else { tr.sender_pid = 0; } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 78f21f8dc2e..43b315f2002 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -710,7 +710,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) struct vfsmount *proc_mnt; int err = -ENOENT; - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt); + proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); if (IS_ERR(proc_mnt)) goto out; diff --git a/fs/proc/root.c b/fs/proc/root.c index 13ef6247e7a..fc1609321a7 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -106,7 +106,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = (struct pid_namespace *)data; options = NULL; } else { - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); options = data; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620d..0dbfba2efa7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3390,7 +3390,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = current->nsproxy->pid_ns; + struct pid_namespace *ns = task_active_pid_ns(current); /* * We can't drop the pidlist_mutex before taking the l->mutex in case diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134..738f3564e83 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6155,7 +6155,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->parent = parent_event; - event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->ns = get_pid_ns(task_active_pid_ns(current)); event->id = atomic64_inc_return(&perf_event_id); event->state = PERF_EVENT_STATE_INACTIVE; diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..7798c247f4b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (thread_group_leader(p)) { if (is_child_reaper(pid)) - p->nsproxy->pid_ns->child_reaper = p; + ns_of_pid(pid)->child_reaper = p; p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ca27d2c5264..acc92680381 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -84,7 +84,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ipc; } - new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), task_active_pid_ns(tsk)); + new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns); if (IS_ERR(new_nsp->pid_ns)) { err = PTR_ERR(new_nsp->pid_ns); goto out_pid; diff --git a/kernel/pid.c b/kernel/pid.c index 2a624f1486e..3a5f238c1ca 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -345,7 +345,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { - return find_pid_ns(nr, current->nsproxy->pid_ns); + return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); @@ -429,7 +429,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) struct task_struct *find_task_by_vpid(pid_t vnr) { - return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); + return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) @@ -484,7 +484,7 @@ EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { - return pid_nr_ns(pid, current->nsproxy->pid_ns); + return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); @@ -495,7 +495,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, rcu_read_lock(); if (!ns) - ns = current->nsproxy->pid_ns; + ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d..b2445d86f22 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1752,7 +1752,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); - info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); + info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 65bdcf198d4..5a638445050 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1344,7 +1344,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, goto out_putname; } - mnt = current->nsproxy->pid_ns->proc_mnt; + mnt = task_active_pid_ns(current)->proc_mnt; file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) -- cgit v1.2.3-70-g09d2 From 0a01f2cc390e10633a54f72c608cc3fe19a50c3d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 1 Aug 2012 10:33:47 -0700 Subject: pidns: Make the pidns proc mount/umount logic obvious. Track the number of pids in the proc hash table. When the number of pids goes to 0 schedule work to unmount the kernel mount of proc. Move the mount of proc into alloc_pid when we allocate the pid for init. Remove the surprising calls of pid_ns_release proc in fork and proc_flush_task. Those code paths really shouldn't know about proc namespace implementation details and people have demonstrated several times that finding and understanding those code paths is difficult and non-obvious. Because of the call path detach pid is alwasy called with the rtnl_lock held free_pid is not allowed to sleep, so the work to unmounting proc is moved to a work queue. This has the side benefit of not blocking the entire world waiting for the unnecessary rcu_barrier in deactivate_locked_super. In the process of making the code clear and obvious this fixes a bug reported by Gao feng where we would leak a mount of proc during clone(CLONE_NEWPID|CLONE_NEWNET) if copy_pid_ns succeeded and copy_net_ns failed. Acked-by: "Serge E. Hallyn" Signed-off-by: "Eric W. Biederman" --- fs/proc/base.c | 4 ---- fs/proc/root.c | 5 ----- include/linux/pid_namespace.h | 2 ++ kernel/fork.c | 2 -- kernel/pid.c | 21 +++++++++++++++++---- kernel/pid_namespace.c | 14 +++++++------- 6 files changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel/fork.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 6177fc238fd..7621dc51cff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2590,10 +2590,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, diff --git a/fs/proc/root.c b/fs/proc/root.c index fc1609321a7..f2f251158d3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -155,11 +155,6 @@ void __init proc_root_init(void) err = register_filesystem(&proc_fs_type); if (err) return; - err = pid_ns_prepare_proc(&init_pid_ns); - if (err) { - unregister_filesystem(&proc_fs_type); - return; - } proc_self_init(); proc_symlink("mounts", NULL, "self/mounts"); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c89c9cfcd24..4c96acdb248 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -21,6 +21,7 @@ struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; int last_pid; + int nr_hashed; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; @@ -32,6 +33,7 @@ struct pid_namespace { struct bsd_acct_struct *bacct; #endif struct user_namespace *user_ns; + struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ diff --git a/kernel/fork.c b/kernel/fork.c index 7798c247f4b..666dc8b0660 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1476,8 +1476,6 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - if (unlikely(clone_flags & CLONE_NEWPID)) - pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) diff --git a/kernel/pid.c b/kernel/pid.c index 3a5f238c1ca..e957f8b0913 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -36,6 +36,7 @@ #include #include #include +#include #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) @@ -270,8 +271,12 @@ void free_pid(struct pid *pid) unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - hlist_del_rcu(&pid->numbers[i].pid_chain); + for (i = 0; i <= pid->level; i++) { + struct upid *upid = pid->numbers + i; + hlist_del_rcu(&upid->pid_chain); + if (--upid->ns->nr_hashed == 0) + schedule_work(&upid->ns->proc_work); + } spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) @@ -293,6 +298,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) goto out; tmp = ns; + pid->level = ns->level; for (i = ns->level; i >= 0; i--) { nr = alloc_pidmap(tmp); if (nr < 0) @@ -303,17 +309,23 @@ struct pid *alloc_pid(struct pid_namespace *ns) tmp = tmp->parent; } + if (unlikely(is_child_reaper(pid))) { + if (pid_ns_prepare_proc(ns)) + goto out_free; + } + get_pid_ns(ns); - pid->level = ns->level; atomic_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); - for ( ; upid >= pid->numbers; --upid) + for ( ; upid >= pid->numbers; --upid) { hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + upid->ns->nr_hashed++; + } spin_unlock_irq(&pidmap_lock); out: @@ -570,6 +582,7 @@ void __init pidmap_init(void) /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); + init_pid_ns.nr_hashed = 1; init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index b2604950aa5..84591cfeefc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -72,6 +72,12 @@ err_alloc: return NULL; } +static void proc_cleanup_work(struct work_struct *work) +{ + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); + pid_ns_release_proc(ns); +} + /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 @@ -105,6 +111,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); + INIT_WORK(&ns->proc_work, proc_cleanup_work); set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); @@ -112,15 +119,8 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); - err = pid_ns_prepare_proc(ns); - if (err) - goto out_put_parent_pid_ns; - return ns; -out_put_parent_pid_ns: - put_pid_ns(parent_pid_ns); - put_user_ns(user_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: -- cgit v1.2.3-70-g09d2 From 1c4042c29bd2e85aac4110552ca8ade763762e84 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Jul 2010 17:10:36 -0700 Subject: pidns: Consolidate initialzation of special init task state Instead of setting child_reaper and SIGNAL_UNKILLABLE one way for the system init process, and another way for pid namespace init processes test pid->nr == 1 and use the same code for both. For the global init this results in SIGNAL_UNKILLABLE being set much earlier in the initialization process. This is a small cleanup and it paves the way for allowing unshare and enter of the pid namespace as that path like our global init also will not set CLONE_NEWPID. Signed-off-by: Eric W. Biederman --- init/main.c | 1 - kernel/fork.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/init/main.c b/init/main.c index 9cf77ab138a..317750a18f7 100644 --- a/init/main.c +++ b/init/main.c @@ -810,7 +810,6 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); - current->signal->flags |= SIGNAL_UNKILLABLE; flush_delayed_fput(); if (ramdisk_execute_command) { diff --git a/kernel/fork.c b/kernel/fork.c index 666dc8b0660..0f2bbce311f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1039,8 +1039,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->live, 1); atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); - if (clone_flags & CLONE_NEWPID) - sig->flags |= SIGNAL_UNKILLABLE; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -1441,8 +1439,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { - if (is_child_reaper(pid)) + if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; + p->signal->flags |= SIGNAL_UNKILLABLE; + } p->signal->leader_pid = pid; p->signal->tty = tty_kref_get(current->signal->tty); -- cgit v1.2.3-70-g09d2 From 50804fe3737ca6a5942fdc2057a18a8141d00141 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 15:41:50 -0800 Subject: pidns: Support unsharing the pid namespace. Unsharing of the pid namespace unlike unsharing of other namespaces does not take affect immediately. Instead it affects the children created with fork and clone. The first of these children becomes the init process of the new pid namespace, the rest become oddball children of pid 0. From the point of view of the new pid namespace the process that created it is pid 0, as it's pid does not map. A couple of different semantics were considered but this one was settled on because it is easy to implement and it is usable from pam modules. The core reasons for the existence of unshare. I took a survey of the callers of pam modules and the following appears to be a representative sample of their logic. { setup stuff include pam child = fork(); if (!child) { setuid() exec /bin/bash } waitpid(child); pam and other cleanup } As you can see there is a fork to create the unprivileged user space process. Which means that the unprivileged user space process will appear as pid 1 in the new pid namespace. Further most login processes do not cope with extraneous children which means shifting the duty of reaping extraneous child process to the creator of those extraneous children makes the system more comprehensible. The practical reason for this set of pid namespace semantics is that it is simple to implement and verify they work correctly. Whereas an implementation that requres changing the struct pid on a process comes with a lot more races and pain. Not the least of which is that glibc caches getpid(). These semantics are implemented by having two notions of the pid namespace of a proces. There is task_active_pid_ns which is the pid namspace the process was created with and the pid namespace that all pids are presented to that process in. The task_active_pid_ns is stored in the struct pid of the task. Then there is the pid namespace that will be used for children that pid namespace is stored in task->nsproxy->pid_ns. Signed-off-by: Eric W. Biederman --- kernel/fork.c | 32 +++++++++++++++++++++++++------- kernel/nsproxy.c | 2 +- kernel/pid_namespace.c | 2 -- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 0f2bbce311f..811ffbad788 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1565,9 +1565,11 @@ long do_fork(unsigned long clone_flags, * Do some preliminary argument and permissions checking before we * actually start allocating stuff */ - if (clone_flags & CLONE_NEWUSER) { - if (clone_flags & CLONE_THREAD) + if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { + if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; + } + if (clone_flags & CLONE_NEWUSER) { /* hopefully this check will go away when userns support is * complete */ @@ -1692,7 +1694,8 @@ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| + CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1763,15 +1766,30 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) int do_sysvsem = 0; int err; - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; - + /* + * If unsharing a pid namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWPID) + unshare_flags |= CLONE_THREAD; + /* + * If unsharing a thread from a thread group, must also unshare vm. + */ + if (unshare_flags & CLONE_THREAD) + unshare_flags |= CLONE_VM; + /* + * If unsharing vm, must also unshare signal handlers. + */ + if (unshare_flags & CLONE_VM) + unshare_flags |= CLONE_SIGHAND; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; + + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index acc92680381..b8d4d8709d7 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -188,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET))) + CLONE_NEWNET | CLONE_NEWPID))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f78fc48c86b..68508d33063 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -144,8 +144,6 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & (CLONE_THREAD|CLONE_PARENT)) - return ERR_PTR(-EINVAL); if (task_active_pid_ns(current) != old_ns) return ERR_PTR(-EINVAL); return create_pid_namespace(user_ns, old_ns); -- cgit v1.2.3-70-g09d2 From 5eaf563e53294d6696e651466697eb9d491f3946 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 21 Nov 2011 17:22:31 -0800 Subject: userns: Allow unprivileged users to create user namespaces. Now that we have been through every permission check in the kernel having uid == 0 and gid == 0 in your local user namespace no longer adds any special privileges. Even having a full set of caps in your local user namespace is safe because capabilies are relative to your local user namespace, and do not confer unexpected privileges. Over the long term this should allow much more of the kernels functionality to be safely used by non-root users. Functionality like unsharing the mount namespace that is only unsafe because it can fool applications whose privileges are raised when they are executed. Since those applications have no privileges in a user namespaces it becomes safe to spoof and confuse those applications all you want. Those capabilities will still need to be enabled carefully because we may still need things like rlimits on the number of unprivileged mounts but that is to avoid DOS attacks not to avoid fooling root owned processes. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/fork.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 811ffbad788..8c29abb1901 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1569,14 +1569,6 @@ long do_fork(unsigned long clone_flags, if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) return -EINVAL; } - if (clone_flags & CLONE_NEWUSER) { - /* hopefully this check will go away when userns support is - * complete - */ - if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) || - !capable(CAP_SETGID)) - return -EPERM; - } /* * Determine whether and which event to report to ptracer. When -- cgit v1.2.3-70-g09d2 From b2e0d98705e60e45bbb3c0032c48824ad7ae0704 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 26 Jul 2012 05:15:35 -0700 Subject: userns: Implement unshare of the user namespace - Add CLONE_THREAD to the unshare flags if CLONE_NEWUSER is selected As changing user namespaces is only valid if all there is only a single thread. - Restore the code to add CLONE_VM if CLONE_THREAD is selected and the code to addCLONE_SIGHAND if CLONE_VM is selected. Making the constraints in the code clear. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- include/linux/nsproxy.h | 2 +- include/linux/user_namespace.h | 9 +++++++++ kernel/fork.c | 25 ++++++++++++++++++++++--- kernel/nsproxy.c | 8 ++++---- kernel/user_namespace.c | 15 +++++++++++++++ 5 files changed, 51 insertions(+), 8 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index cc37a55ad00..10e5947491c 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -67,7 +67,7 @@ void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, - struct fs_struct *); + struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 95142cae446..17651f08d67 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -39,6 +39,7 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns) } extern int create_user_ns(struct cred *new); +extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void free_user_ns(struct kref *kref); static inline void put_user_ns(struct user_namespace *ns) @@ -66,6 +67,14 @@ static inline int create_user_ns(struct cred *new) return -EINVAL; } +static inline int unshare_userns(unsigned long unshare_flags, + struct cred **new_cred) +{ + if (unshare_flags & CLONE_NEWUSER) + return -EINVAL; + return 0; +} + static inline void put_user_ns(struct user_namespace *ns) { } diff --git a/kernel/fork.c b/kernel/fork.c index 8c29abb1901..38e53b87402 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1687,7 +1687,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing to @@ -1754,10 +1754,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *fd, *new_fd = NULL; + struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; + /* + * If unsharing a user namespace must also unshare the thread. + */ + if (unshare_flags & CLONE_NEWUSER) + unshare_flags |= CLONE_THREAD; /* * If unsharing a pid namespace must also unshare the thread. */ @@ -1795,11 +1801,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; - err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_cred, new_fs); + if (err) + goto bad_unshare_cleanup_cred; - if (new_fs || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1832,11 +1842,20 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } task_unlock(current); + + if (new_cred) { + /* Install the new user namespace */ + commit_creds(new_cred); + new_cred = NULL; + } } if (new_nsproxy) put_nsproxy(new_nsproxy); +bad_unshare_cleanup_cred: + if (new_cred) + put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 2ddd81657a2..78e2ecb2016 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -186,7 +186,7 @@ void free_nsproxy(struct nsproxy *ns) * On success, returns the new nsproxy. */ int unshare_nsproxy_namespaces(unsigned long unshare_flags, - struct nsproxy **new_nsp, struct fs_struct *new_fs) + struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs) { struct user_namespace *user_ns; int err = 0; @@ -195,12 +195,12 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, CLONE_NEWNET | CLONE_NEWPID))) return 0; - if (!nsown_capable(CAP_SYS_ADMIN)) + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - user_ns = current_user_ns(); *new_nsp = create_new_namespaces(unshare_flags, current, user_ns, - new_fs ? new_fs : current->fs); + new_fs ? new_fs : current->fs); if (IS_ERR(*new_nsp)) { err = PTR_ERR(*new_nsp); goto out; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9460774e77..ce92f7e6290 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -82,6 +82,21 @@ int create_user_ns(struct cred *new) return 0; } +int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) +{ + struct cred *cred; + + if (!(unshare_flags & CLONE_NEWUSER)) + return 0; + + cred = prepare_creds(); + if (!cred) + return -ENOMEM; + + *new_cred = cred; + return create_user_ns(cred); +} + void free_user_ns(struct kref *kref) { struct user_namespace *parent, *ns = -- cgit v1.2.3-70-g09d2 From d37f761dbd276790f70dcf73a287fde2c3464482 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Nov 2012 00:58:35 +0100 Subject: cputime: Consolidate cputime adjustment code task_cputime_adjusted() and thread_group_cputime_adjusted() essentially share the same code. They just don't use the same source: * The first function uses the cputime in the task struct and the previous adjusted snapshot that ensures monotonicity. * The second adds the cputime of all tasks in the group and the previous adjusted snapshot of the whole group from the signal structure. Just consolidate the common code that does the adjustment. These functions just need to fetch the values from the appropriate source. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- include/linux/sched.h | 23 +++++++++++++++++++---- kernel/fork.c | 2 +- kernel/sched/cputime.c | 46 +++++++++++++++++++++++----------------------- 3 files changed, 43 insertions(+), 28 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index e75cab5820a..5dafac36681 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -433,14 +433,29 @@ struct cpu_itimer { u32 incr_error; }; +/** + * struct cputime - snaphsot of system and user cputime + * @utime: time spent in user mode + * @stime: time spent in system mode + * + * Gathers a generic snapshot of user and system time. + */ +struct cputime { + cputime_t utime; + cputime_t stime; +}; + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * - * This structure groups together three kinds of CPU time that are - * tracked for threads and thread groups. Most things considering + * This is an extension of struct cputime that includes the total runtime + * spent by the task from the scheduler point of view. + * + * As a result, this structure groups together three kinds of CPU time + * that are tracked for threads and thread groups. Most things considering * CPU time want to group these counts together and treat all three * of them in parallel. */ @@ -581,7 +596,7 @@ struct signal_struct { cputime_t gtime; cputime_t cgtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; @@ -1340,7 +1355,7 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..0e7cdb90476 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7dc155371b9..220fdc4db77 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,14 +516,18 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + utime = curr->utime; + total = utime + curr->stime; /* * Use CFS's precise accounting: */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -533,11 +537,22 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Compare with previous values, to keep monotonicity: */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); + + *ut = prev->utime; + *st = prev->stime; +} - *ut = p->prev_utime; - *st = p->prev_stime; +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* @@ -545,24 +560,9 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif -- cgit v1.2.3-70-g09d2 From d2125043aebf7f53cd1c72115c17b01d0bc06ce1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 23 Oct 2012 13:17:59 -0400 Subject: generic sys_fork / sys_vfork / sys_clone ... and get rid of idiotic struct pt_regs * in asm-generic/syscalls.h prototypes of the same, while we are at it. Eventually we want those in linux/syscalls.h, of course, but that'll have to wait a bit. Note that there are *three* variants of sys_clone() order of arguments. Braindamage galore... Signed-off-by: Al Viro --- arch/Kconfig | 14 ++++++++++++++ include/asm-generic/syscalls.h | 7 +++---- kernel/fork.c | 43 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/Kconfig b/arch/Kconfig index 366ec06a518..8d698fb5ccc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -341,4 +341,18 @@ config MODULES_USE_ELF_REL Modules only use ELF REL relocations. Modules with ELF RELA relocations will give an error. +# +# ABI hall of shame +# +config CLONE_BACKWARDS + bool + help + Architecture has tls passed as the 4th argument of clone(2), + not the 5th one. + +config CLONE_BACKWARDS2 + bool + help + Architecture has the first two arguments of clone(2) swapped. + source "kernel/gcov/Kconfig" diff --git a/include/asm-generic/syscalls.h b/include/asm-generic/syscalls.h index d89dec864d4..7e4fdb64995 100644 --- a/include/asm-generic/syscalls.h +++ b/include/asm-generic/syscalls.h @@ -10,16 +10,15 @@ */ #ifndef sys_clone asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, - void __user *parent_tid, void __user *child_tid, - struct pt_regs *regs); + void __user *parent_tid, void __user *child_tid); #endif #ifndef sys_fork -asmlinkage long sys_fork(struct pt_regs *regs); +asmlinkage long sys_fork(void); #endif #ifndef sys_vfork -asmlinkage long sys_vfork(struct pt_regs *regs); +asmlinkage long sys_vfork(void); #endif #ifndef sys_execve diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..27a337549da 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1645,6 +1645,49 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) } #endif +#ifdef __ARCH_WANT_SYS_FORK +SYSCALL_DEFINE0(fork) +{ +#ifdef CONFIG_MMU + return do_fork(SIGCHLD, 0, current_pt_regs(), 0, NULL, NULL); +#else + /* can not support in nommu mode */ + return(-EINVAL); +#endif +} +#endif + +#ifdef __ARCH_WANT_SYS_VFORK +SYSCALL_DEFINE0(vfork) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, current_pt_regs(), + 0, NULL, NULL); +} +#endif + +#ifdef __ARCH_WANT_SYS_CLONE +#ifdef CONFIG_CLONE_BACKWARDS +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, + int __user *, parent_tidptr, + int, tls_val, + int __user *, child_tidptr) +#elif defined(CONFIG_CLONE_BACKWARDS2) +SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) +#else +SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, + int __user *, parent_tidptr, + int __user *, child_tidptr, + int, tls_val) +#endif +{ + return do_fork(clone_flags, newsp, current_pt_regs(), 0, + parent_tidptr, child_tidptr); +} +#endif + #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -- cgit v1.2.3-70-g09d2 From afa86fc426ff7e7f5477f15da9c405d08d5cf790 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 22:51:14 -0400 Subject: flagday: don't pass regs to copy_thread() Signed-off-by: Al Viro --- arch/alpha/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/arm64/kernel/process.c | 3 +-- arch/avr32/kernel/process.c | 2 +- arch/blackfin/kernel/process.c | 6 +++--- arch/c6x/kernel/process.c | 2 +- arch/cris/arch-v10/kernel/process.c | 3 +-- arch/cris/arch-v32/kernel/process.c | 3 +-- arch/frv/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/hexagon/kernel/process.c | 3 +-- arch/ia64/kernel/process.c | 3 ++- arch/m32r/kernel/process.c | 2 +- arch/m68k/kernel/process.c | 3 +-- arch/microblaze/kernel/process.c | 3 +-- arch/mips/kernel/process.c | 4 ++-- arch/mn10300/kernel/process.c | 2 +- arch/openrisc/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 5 ++--- arch/powerpc/kernel/process.c | 4 ++-- arch/s390/kernel/process.c | 3 +-- arch/score/kernel/process.c | 4 ++-- arch/sh/kernel/process_32.c | 3 +-- arch/sh/kernel/process_64.c | 5 ++--- arch/sparc/kernel/process_32.c | 5 ++--- arch/sparc/kernel/process_64.c | 4 ++-- arch/tile/kernel/process.c | 5 ++--- arch/um/kernel/process.c | 3 +-- arch/unicore32/kernel/process.c | 2 +- arch/x86/kernel/process_32.c | 3 +-- arch/x86/kernel/process_64.c | 3 +-- arch/xtensa/kernel/process.c | 3 +-- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- 34 files changed, 45 insertions(+), 60 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e9705bcc96f..b5d0d092369 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -241,7 +241,7 @@ release_thread(struct task_struct *dead_task) int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, struct pt_regs *wontuse) + struct task_struct *p) { extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 4ab80bbb6d9..9800338c5d1 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,7 +376,7 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, struct pt_regs *unused) + unsigned long stk_sz, struct task_struct *p) { struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 5a1335caf6f..cb0956bc96e 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -234,8 +234,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, - struct pt_regs *unused) + unsigned long stk_sz, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); unsigned long tls = p->thread.tp_value; diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index 03d7aa4a4bc..fd78f58ea79 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -299,7 +299,7 @@ asmlinkage void syscall_return(void); int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index e5ae8fcab43..582276efaaa 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -141,14 +141,14 @@ asmlinkage int bfin_clone(unsigned long clone_flags, unsigned long newsp) int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, - struct task_struct *p, struct pt_regs *regs) + struct task_struct *p) { struct pt_regs *childregs; unsigned long *v; childregs = (struct pt_regs *) (task_stack_page(p) + THREAD_SIZE) - 1; v = ((unsigned long *)childregs) - 2; - if (unlikely(!regs)) { + if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); v[0] = usp; v[1] = topstk; @@ -157,7 +157,7 @@ copy_thread(unsigned long clone_flags, __asm__ __volatile__("%0 = syscfg;":"=da"(childregs->syscfg):); p->thread.usp = 0; } else { - *childregs = *regs; + *childregs = *current_pt_regs(); childregs->r0 = 0; p->thread.usp = usp ? : rdusp(); v[0] = v[1] = 0; diff --git a/arch/c6x/kernel/process.c b/arch/c6x/kernel/process.c index a3f91895e8b..6434df476f7 100644 --- a/arch/c6x/kernel/process.c +++ b/arch/c6x/kernel/process.c @@ -139,7 +139,7 @@ void start_thread(struct pt_regs *regs, unsigned int pc, unsigned long usp) */ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long ustk_size, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct pt_regs *childregs; diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 520547c8b19..b1018750cff 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -94,8 +94,7 @@ asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct switch_stack *swstack = ((struct switch_stack *)childregs) - 1; diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index 331e70252df..2b23ef0e445 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -109,8 +109,7 @@ extern asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct switch_stack *swstack = ((struct switch_stack *) childregs) - 1; diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 0039bf77b19..23916b2a12a 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -144,7 +144,7 @@ inline unsigned long user_stack(const struct pt_regs *regs) */ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct pt_regs *childregs; diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index b0fb4054aee..b609f63f159 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -129,7 +129,7 @@ void flush_thread(void) int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long topstk, - struct task_struct * p, struct pt_regs *unused) + struct task_struct * p) { struct pt_regs * childregs; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 36dce17ed25..06ae9ffcabd 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -87,8 +87,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk) * Copy architecture-specific thread state */ int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, - struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct hexagon_switch_stack *ss; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 25543a295ad..31360cbbd5f 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -393,12 +393,13 @@ ia64_load_extra (struct task_struct *task) int copy_thread(unsigned long clone_flags, unsigned long user_stack_base, unsigned long user_stack_size, - struct task_struct *p, struct pt_regs *regs) + struct task_struct *p) { extern char ia64_ret_from_clone; struct switch_stack *child_stack, *stack; unsigned long rbs, child_rbs, rbs_size; struct pt_regs *child_ptregs; + struct pt_regs *regs = current_pt_regs(); int retval = 0; child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index c37e9a9a8f2..765d0f57c78 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -192,7 +192,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) } int copy_thread(unsigned long clone_flags, unsigned long spu, - unsigned long arg, struct task_struct *tsk, struct pt_regs *unused) + unsigned long arg, struct task_struct *tsk) { struct pt_regs *childregs = task_pt_regs(tsk); extern void ret_from_fork(void); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index aa9b1100027..9a3df4df73c 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -154,8 +154,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs) } int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct * p, struct pt_regs * unused) + unsigned long arg, struct task_struct *p) { struct fork_frame { struct switch_stack sw; diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index a5fed8db726..40823fd1db0 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -120,8 +120,7 @@ void flush_thread(void) } int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct thread_info *ti = task_thread_info(p); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index d13720ac656..38097652d62 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -114,10 +114,10 @@ void flush_thread(void) } int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - struct pt_regs *childregs; + struct pt_regs *childregs, *regs = current_pt_regs(); unsigned long childksp; p->set_child_tid = p->clear_child_tid = NULL; diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 5e0ef396458..eb09f5a552f 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -206,7 +206,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ int copy_thread(unsigned long clone_flags, unsigned long c_usp, unsigned long ustk_size, - struct task_struct *p, struct pt_regs *unused) + struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct pt_regs *c_regs; diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 6b853668369..00c233bf0d0 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -142,7 +142,7 @@ extern asmlinkage void ret_from_fork(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct pt_regs *userregs; struct pt_regs *kregs; diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 9753ecf49a0..d13507246c5 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -204,10 +204,9 @@ int dump_task_fpu (struct task_struct *tsk, elf_fpregset_t *r) int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { - struct pt_regs * cregs = &(p->thread.regs); + struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); /* We have to use void * instead of a function pointer, because diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a3143756763..81430674e71 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -733,8 +733,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) extern unsigned long dscr_default; /* defined in arch/powerpc/kernel/sysfs.c */ int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p, - struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); @@ -759,6 +758,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, ti->flags |= _TIF_RESTOREALL; f = ret_from_kernel_thread; } else { + struct pt_regs *regs = current_pt_regs(); CHECK_FULL_REGS(regs); *childregs = *regs; if (usp) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e37677796a0..536d64579d9 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -117,8 +117,7 @@ void release_thread(struct task_struct *dead_task) } int copy_thread(unsigned long clone_flags, unsigned long new_stackp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct thread_info *ti; struct fake_frame diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index f96379a5aee..79568466b57 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -87,11 +87,11 @@ void flush_thread(void) {} * set up the kernel stack and exception frames for a new process */ int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); + struct pt_regs *regs = current_pt_regs(); p->thread.reg0 = (unsigned long) childregs; if (unlikely(p->flags & PF_KTHREAD)) { diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 1786d16b6c6..73eb66fc625 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -128,8 +128,7 @@ asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs; diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index d5c86a8a384..e611c85144b 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -371,10 +371,9 @@ asmlinkage void ret_from_fork(void); asmlinkage void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { - struct pt_regs *childregs; + struct pt_regs *childregs, *regs = current_pt_regs(); #ifdef CONFIG_SH_FPU /* can't happen for a kernel thread */ diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index bf4c6addce7..ecde946ef83 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -319,11 +319,10 @@ extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *ti = task_thread_info(p); - struct pt_regs *childregs; + struct pt_regs *childregs, *regs = current_pt_regs(); char *new_stack; #ifndef CONFIG_SMP diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index dff54f46728..58ef19e7e82 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -622,10 +622,10 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, * Child --> %o0 == parents pid, %o1 == 1 */ int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { struct thread_info *t = task_thread_info(p); + struct pt_regs *regs = current_pt_regs(); struct sparc_stackf *parent_sf; unsigned long child_stack_sz; char *child_trap_frame; diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 267936b51b5..0e5661e7d00 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -157,10 +157,9 @@ void arch_release_thread_info(struct thread_info *info) static void save_arch_state(struct thread_struct *t); int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { - struct pt_regs *childregs = task_pt_regs(p); + struct pt_regs *childregs = task_pt_regs(p), *regs = current_pt_regs(); unsigned long ksp; unsigned long *callee_regs; diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index c502c804e8b..b462b13c5ba 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -161,8 +161,7 @@ void fork_handler(void) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct * p, - struct pt_regs *regs) + unsigned long arg, struct task_struct * p) { void (*handler)(void); int kthread = current->flags & PF_KTHREAD; diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index 79e44e8ae31..62bad9fed03 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -262,7 +262,7 @@ asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p, struct pt_regs *unused) + unsigned long stk_sz, struct task_struct *p) { struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 16efa974532..b5a8905785e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -128,8 +128,7 @@ void release_thread(struct task_struct *dead_task) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); struct task_struct *tsk; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 74aac76c6e3..6e68a619496 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -146,8 +146,7 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls) } int copy_thread(unsigned long clone_flags, unsigned long sp, - unsigned long arg, - struct task_struct *p, struct pt_regs *regs) + unsigned long arg, struct task_struct *p) { int err; struct pt_regs *childregs; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 0036c14739f..1accf28da5f 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -199,8 +199,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) */ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, - unsigned long thread_fn_arg, - struct task_struct *p, struct pt_regs *unused) + unsigned long thread_fn_arg, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); diff --git a/include/linux/sched.h b/include/linux/sched.h index c57249782e4..78a2ae3470d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2271,7 +2271,7 @@ extern void mm_release(struct task_struct *, struct mm_struct *); extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(unsigned long, unsigned long, unsigned long, - struct task_struct *, struct pt_regs *); + struct task_struct *); extern void flush_thread(void); extern void exit_thread(void); diff --git a/kernel/fork.c b/kernel/fork.c index 27a337549da..d96a562b131 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1320,7 +1320,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); + retval = copy_thread(clone_flags, stack_start, stack_size, p); if (retval) goto bad_fork_cleanup_io; -- cgit v1.2.3-70-g09d2 From 62e791c1b8ea481c72c299dee4f62c04aaef765c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 22:52:26 -0400 Subject: don't pass regs to copy_process() Signed-off-by: Al Viro --- kernel/fork.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index d96a562b131..fa24a78b94f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1127,7 +1127,6 @@ static void posix_cpu_timers_init(struct task_struct *tsk) */ static struct task_struct *copy_process(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, @@ -1536,8 +1535,7 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct task_struct *task; struct pt_regs regs; - task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid, 0); + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); init_idle(task, cpu); @@ -1596,7 +1594,7 @@ long do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, regs, stack_size, + p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer -- cgit v1.2.3-70-g09d2 From 18c26c27ae0abe82253cb2e2363df465dbbb657e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 22:53:20 -0400 Subject: death to idle_regs() Signed-off-by: Al Viro --- arch/ia64/kernel/smpboot.c | 5 ----- arch/x86/include/asm/processor.h | 2 -- arch/x86/kernel/cpu/common.c | 9 --------- kernel/fork.c | 6 ------ 4 files changed, 22 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 963d2db53bf..6a368cb2043 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -460,11 +460,6 @@ start_secondary (void *unused) return 0; } -struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) -{ - return NULL; -} - static int __cpuinit do_boot_cpu (int sapicid, int cpu, struct task_struct *idle) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ad1fc851167..92f48a5a6b2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -178,8 +178,6 @@ static inline int hlt_works(int cpu) extern void cpu_detect(struct cpuinfo_x86 *c); -extern struct pt_regs *idle_regs(struct pt_regs *); - extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7505f7b13e7..6a6432cf89d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1173,15 +1173,6 @@ DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* Make sure %fs and %gs are initialized properly in idle threads */ -struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - regs->fs = __KERNEL_PERCPU; - regs->gs = __KERNEL_STACK_CANARY; - - return regs; -} #endif /* CONFIG_X86_64 */ /* diff --git a/kernel/fork.c b/kernel/fork.c index fa24a78b94f..0e68b6686ac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1514,12 +1514,6 @@ fork_out: return ERR_PTR(retval); } -noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - return regs; -} - static inline void init_idle_pids(struct pid_link *links) { enum pid_type type; -- cgit v1.2.3-70-g09d2 From e80d6661c3a5caa0cebec0853c6cb0db090fb506 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 Oct 2012 23:10:08 -0400 Subject: flagday: kill pt_regs argument of do_fork() Signed-off-by: Al Viro --- arch/blackfin/kernel/process.c | 2 +- arch/ia64/kernel/entry.S | 14 ++++++-------- arch/m68k/kernel/process.c | 2 +- arch/mips/kernel/linux32.c | 2 +- arch/mips/kernel/syscall.c | 4 ++-- arch/sparc/kernel/process_32.c | 3 +-- arch/sparc/kernel/process_64.c | 3 +-- include/linux/sched.h | 2 +- kernel/fork.c | 13 +++++-------- 9 files changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 582276efaaa..3e16ad9b0a9 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -135,7 +135,7 @@ asmlinkage int bfin_clone(unsigned long clone_flags, unsigned long newsp) #endif if (newsp) newsp -= 12; - return do_fork(clone_flags, newsp, regs, 0, NULL, NULL); + return do_fork(clone_flags, newsp, 0, NULL, NULL); } int diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 940a6726362..e25b784a2b7 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -116,13 +116,12 @@ GLOBAL_ENTRY(sys_clone2) mov loc1=r16 // save ar.pfs across do_fork .body mov out1=in1 - mov out3=in2 + mov out2=in2 tbit.nz p6,p0=in0,CLONE_SETTLS_BIT - mov out4=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + mov out3=in3 // parent_tidptr: valid only w/CLONE_PARENT_SETTID ;; (p6) st8 [r2]=in5 // store TLS in r16 for copy_thread() - mov out5=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID - adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out4=in4 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID mov out0=in0 // out0 = clone_flags br.call.sptk.many rp=do_fork .ret1: .restore sp @@ -148,13 +147,12 @@ GLOBAL_ENTRY(sys_clone) mov loc1=r16 // save ar.pfs across do_fork .body mov out1=in1 - mov out3=16 // stacksize (compensates for 16-byte scratch area) + mov out2=16 // stacksize (compensates for 16-byte scratch area) tbit.nz p6,p0=in0,CLONE_SETTLS_BIT - mov out4=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID + mov out3=in2 // parent_tidptr: valid only w/CLONE_PARENT_SETTID ;; (p6) st8 [r2]=in4 // store TLS in r13 (tp) - mov out5=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID - adds out2=IA64_SWITCH_STACK_SIZE+16,sp // out2 = ®s + mov out4=in3 // child_tidptr: valid only w/CLONE_CHILD_SETTID or CLONE_CHILD_CLEARTID mov out0=in0 // out0 = clone_flags br.call.sptk.many rp=do_fork .ret2: .restore sp diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 9a3df4df73c..d538694ad20 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -149,7 +149,7 @@ void flush_thread(void) asmlinkage int m68k_clone(struct pt_regs *regs) { /* regs will be equal to current_pt_regs() */ - return do_fork(regs->d1, regs->d2, regs, 0, + return do_fork(regs->d1, regs->d2, 0, (int __user *)regs->d3, (int __user *)regs->d4); } diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 8796dbc7e35..7adab86c632 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -312,7 +312,7 @@ _sys32_clone(nabi_no_regargs struct pt_regs regs) /* Use __dummy4 instead of getting it off the stack, so that syscall() works. */ child_tidptr = (int __user *) __dummy4; - return do_fork(clone_flags, newsp, ®s, 0, + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index c611e2df776..201cb76b4df 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -92,7 +92,7 @@ save_static_function(sys_fork); static int __used noinline _sys_fork(nabi_no_regargs struct pt_regs regs) { - return do_fork(SIGCHLD, regs.regs[29], ®s, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.regs[29], 0, NULL, NULL); } save_static_function(sys_clone); @@ -123,7 +123,7 @@ _sys_clone(nabi_no_regargs struct pt_regs regs) #else child_tidptr = (int __user *) regs.regs[8]; #endif - return do_fork(clone_flags, newsp, ®s, 0, + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index ecde946ef83..be8e862bada 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -286,8 +286,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags, parent_tid_ptr = regs->u_regs[UREG_I2]; child_tid_ptr = regs->u_regs[UREG_I4]; - ret = do_fork(clone_flags, stack_start, - regs, stack_size, + ret = do_fork(clone_flags, stack_start, stack_size, (int __user *) parent_tid_ptr, (int __user *) child_tid_ptr); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 58ef19e7e82..cdb80b2adbe 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -601,8 +601,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags, child_tid_ptr = (int __user *) regs->u_regs[UREG_I4]; } - ret = do_fork(clone_flags, stack_start, - regs, stack_size, + ret = do_fork(clone_flags, stack_start, stack_size, parent_tid_ptr, child_tid_ptr); /* If we get an error and potentially restart the system diff --git a/include/linux/sched.h b/include/linux/sched.h index 78a2ae3470d..1162258bcaf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2289,7 +2289,7 @@ extern int disallow_signal(int); extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *); -extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); #ifdef CONFIG_GENERIC_KERNEL_THREAD extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/kernel/fork.c b/kernel/fork.c index 0e68b6686ac..54073078343 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1527,8 +1527,6 @@ static inline void init_idle_pids(struct pid_link *links) struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; - struct pt_regs regs; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); if (!IS_ERR(task)) { init_idle_pids(task->pids); @@ -1546,7 +1544,6 @@ struct task_struct * __cpuinit fork_idle(int cpu) */ long do_fork(unsigned long clone_flags, unsigned long stack_start, - struct pt_regs *regs, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) @@ -1576,7 +1573,7 @@ long do_fork(unsigned long clone_flags, * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ - if (!(clone_flags & CLONE_UNTRACED) && likely(user_mode(regs))) { + if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if ((clone_flags & CSIGNAL) != SIGCHLD) @@ -1632,7 +1629,7 @@ long do_fork(unsigned long clone_flags, */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, NULL, + return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } #endif @@ -1641,7 +1638,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return do_fork(SIGCHLD, 0, current_pt_regs(), 0, NULL, NULL); + return do_fork(SIGCHLD, 0, 0, NULL, NULL); #else /* can not support in nommu mode */ return(-EINVAL); @@ -1652,7 +1649,7 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, current_pt_regs(), + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, 0, NULL, NULL); } #endif @@ -1675,7 +1672,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - return do_fork(clone_flags, newsp, current_pt_regs(), 0, + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif -- cgit v1.2.3-70-g09d2 From 5bca23035391928c4c7301835accca3551b96cc2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 22 Nov 2012 14:40:03 +0000 Subject: mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node Due to the fact that migrations are driven by the CPU a task is running on there is no point tracking NUMA faults until one task runs on a new node. This patch tracks the first node used by an address space. Until it changes, PTE scanning is disabled and no NUMA hinting faults are trapped. This should help workloads that are short-lived, do not care about NUMA placement or have bound themselves to a single node. This takes advantage of the logic in "mm: sched: numa: Implement slow start for working set sampling" to delay when the checks are made. This will take advantage of processes that set their CPU and node bindings early in their lifetime. It will also potentially allow any initial load balancing to take place. Signed-off-by: Mel Gorman --- include/linux/mm_types.h | 10 ++++++++++ kernel/fork.c | 3 +++ kernel/sched/fair.c | 18 ++++++++++++++++++ kernel/sched/features.h | 4 +++- 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e850a23dd6e..197422a1598 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -418,10 +418,20 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; + + /* + * The first node a task was scheduled on. If a task runs on + * a different node than Make PTE Scan Go Now. + */ + int first_nid; #endif struct uprobes_state uprobes_state; }; +/* first nid will either be a valid NID or one of these values */ +#define NUMA_PTE_SCAN_INIT -1 +#define NUMA_PTE_SCAN_ACTIVE -2 + static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..296ea308096 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -820,6 +820,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; +#endif +#ifdef CONFIG_NUMA_BALANCING + mm->first_nid = NUMA_PTE_SCAN_INIT; #endif if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a02a2082e9..3e18f611a5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -860,6 +860,24 @@ void task_numa_work(struct callback_head *work) if (p->flags & PF_EXITING) return; + /* + * We do not care about task placement until a task runs on a node + * other than the first one used by the address space. This is + * largely because migrations are driven by what CPU the task + * is running on. If it's never scheduled on another node, it'll + * not migrate so why bother trapping the fault. + */ + if (mm->first_nid == NUMA_PTE_SCAN_INIT) + mm->first_nid = numa_node_id(); + if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { + /* Are we running on a new node yet? */ + if (numa_node_id() == mm->first_nid && + !sched_feat_numa(NUMA_FORCE)) + return; + + mm->first_nid = NUMA_PTE_SCAN_ACTIVE; + } + /* * Reset the scan period if enough time has gone by. Objective is that * scanning will be reduced if pages are properly placed. As tasks diff --git a/kernel/sched/features.h b/kernel/sched/features.h index d2373a3e325..e7c25fff1e9 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -65,8 +65,10 @@ SCHED_FEAT(LB_MIN, false) /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= + * numa_balancing=. Allow PTE scanning to be forced on UMA machines + * for debugging the core machinery. */ #ifdef CONFIG_NUMA_BALANCING SCHED_FEAT(NUMA, false) +SCHED_FEAT(NUMA_FORCE, false) #endif -- cgit v1.2.3-70-g09d2 From 2ad306b17c0ac5a1b1f250d5f772aeb87fdf1eba Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:18 -0800 Subject: fork: protect architectures where THREAD_SIZE >= PAGE_SIZE against fork bombs Because those architectures will draw their stacks directly from the page allocator, rather than the slab cache, we can directly pass __GFP_KMEMCG flag, and issue the corresponding free_pages. This code path is taken when the architecture doesn't define CONFIG_ARCH_THREAD_INFO_ALLOCATOR (only ia64 seems to), and has THREAD_SIZE >= PAGE_SIZE. Luckily, most - if not all - of the remaining architectures fall in this category. This will guarantee that every stack page is accounted to the memcg the process currently lives on, and will have the allocations to fail if they go over limit. For the time being, I am defining a new variant of THREADINFO_GFP, not to mess with the other path. Once the slab is also tracked by memcg, we can get rid of that flag. Tested to successfully protect against :(){ :|:& };: Signed-off-by: Glauber Costa Acked-by: Frederic Weisbecker Acked-by: Kamezawa Hiroyuki Reviewed-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Greg Thelen Cc: Johannes Weiner Cc: JoonSoo Kim Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Suleiman Souhlal Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thread_info.h | 2 ++ kernel/fork.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ccc1899bd62..e7e04736802 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,6 +61,8 @@ extern long do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif +#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) + /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/kernel/fork.c b/kernel/fork.c index c36c4e301ef..85f6d536608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP, + struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; @@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - free_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; -- cgit v1.2.3-70-g09d2 From ae903caae267154de7cf8576b130ff474630596b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Dec 2012 12:44:11 -0500 Subject: Bury the conditionals from kernel_thread/kernel_execve series All architectures have CONFIG_GENERIC_KERNEL_THREAD CONFIG_GENERIC_KERNEL_EXECVE __ARCH_WANT_SYS_EXECVE None of them have __ARCH_WANT_KERNEL_EXECVE and there are only two callers of kernel_execve() (which is a trivial wrapper for do_execve() now) left. Kill the conditionals and make both callers use do_execve(). Signed-off-by: Al Viro --- arch/Kconfig | 6 ------ arch/alpha/Kconfig | 2 -- arch/alpha/include/asm/unistd.h | 1 - arch/arm/Kconfig | 2 -- arch/arm/include/asm/unistd.h | 1 - arch/arm64/Kconfig | 2 -- arch/arm64/include/asm/unistd.h | 1 - arch/avr32/Kconfig | 2 -- arch/avr32/include/asm/unistd.h | 1 - arch/blackfin/Kconfig | 2 -- arch/blackfin/include/asm/unistd.h | 1 - arch/c6x/Kconfig | 2 -- arch/c6x/include/uapi/asm/unistd.h | 1 - arch/cris/Kconfig | 2 -- arch/cris/include/asm/unistd.h | 1 - arch/frv/Kconfig | 2 -- arch/frv/include/asm/unistd.h | 1 - arch/h8300/Kconfig | 2 -- arch/h8300/include/asm/unistd.h | 1 - arch/hexagon/Kconfig | 2 -- arch/hexagon/include/uapi/asm/unistd.h | 1 - arch/ia64/Kconfig | 2 -- arch/ia64/include/asm/unistd.h | 1 - arch/m32r/Kconfig | 2 -- arch/m32r/include/asm/unistd.h | 1 - arch/m68k/Kconfig | 2 -- arch/m68k/include/asm/unistd.h | 1 - arch/microblaze/Kconfig | 2 -- arch/microblaze/include/asm/unistd.h | 1 - arch/mips/Kconfig | 2 -- arch/mips/include/asm/unistd.h | 1 - arch/mn10300/Kconfig | 2 -- arch/mn10300/include/asm/unistd.h | 1 - arch/openrisc/Kconfig | 2 -- arch/openrisc/include/uapi/asm/unistd.h | 1 - arch/parisc/Kconfig | 2 -- arch/parisc/include/asm/unistd.h | 1 - arch/powerpc/Kconfig | 2 -- arch/powerpc/include/asm/unistd.h | 1 - arch/s390/Kconfig | 2 -- arch/s390/include/asm/unistd.h | 1 - arch/score/Kconfig | 2 -- arch/score/include/asm/unistd.h | 1 - arch/sh/Kconfig | 2 -- arch/sh/include/asm/unistd.h | 1 - arch/sparc/Kconfig | 2 -- arch/sparc/include/asm/unistd.h | 1 - arch/tile/Kconfig | 2 -- arch/tile/include/asm/unistd.h | 1 - arch/unicore32/Kconfig | 2 -- arch/unicore32/include/uapi/asm/unistd.h | 1 - arch/x86/Kconfig | 2 -- arch/x86/include/asm/unistd.h | 1 - arch/x86/um/Kconfig | 2 -- arch/xtensa/Kconfig | 2 -- arch/xtensa/include/asm/unistd.h | 1 - fs/exec.c | 21 --------------------- include/linux/binfmts.h | 4 ---- include/linux/sched.h | 2 -- include/linux/syscalls.h | 9 --------- init/main.c | 4 +++- kernel/fork.c | 2 -- kernel/kmod.c | 6 +++--- 63 files changed, 6 insertions(+), 131 deletions(-) (limited to 'kernel/fork.c') diff --git a/arch/Kconfig b/arch/Kconfig index 8d698fb5ccc..0a8dd0585d0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -271,12 +271,6 @@ config ARCH_WANT_OLD_COMPAT_IPC select ARCH_WANT_COMPAT_IPC_PARSE_VERSION bool -config GENERIC_KERNEL_THREAD - bool - -config GENERIC_KERNEL_EXECVE - bool - config HAVE_ARCH_SECCOMP_FILTER bool help diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 5dd7f5db24d..7e3710c0cce 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -20,8 +20,6 @@ config ALPHA select GENERIC_CMOS_UPDATE select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA help diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index eb3a4664ced..68c75f2fadb 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -481,7 +481,6 @@ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8918a2dd89b..b789654e7e2 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -11,8 +11,6 @@ config ARM select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select GENERIC_PCI_IOMAP select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 7cd13cc6262..21a2700d295 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -41,7 +41,6 @@ #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_SOCKETCALL #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4b03c56ec32..a846029bebc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -7,8 +7,6 @@ config ARM64 select GENERIC_IOMAP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW - select GENERIC_KERNEL_EXECVE - select GENERIC_KERNEL_THREAD select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select HARDIRQS_SW_RESEND diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index d69aeea6da1..738322945d1 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -27,6 +27,5 @@ #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 649aeb9acec..06e73bf665e 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -17,8 +17,6 @@ config AVR32 select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/avr32/include/asm/unistd.h b/arch/avr32/include/asm/unistd.h index f05a9804e8e..0bdf6371574 100644 --- a/arch/avr32/include/asm/unistd.h +++ b/arch/avr32/include/asm/unistd.h @@ -39,7 +39,6 @@ #define __ARCH_WANT_SYS_GETPGRP #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index ab9ff4075f4..b6f3ad5441c 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -45,8 +45,6 @@ config BLACKFIN select ARCH_USES_GETTIMEOFFSET if !GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config GENERIC_CSUM def_bool y diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index 460514a1a4e..38711e28baa 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -446,7 +446,6 @@ #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_VFORK /* diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 66eab3703c7..f6a3648f5ec 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -17,8 +17,6 @@ config C6X select OF select OF_EARLY_FLATTREE select GENERIC_CLOCKEVENTS - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_RELA config MMU diff --git a/arch/c6x/include/uapi/asm/unistd.h b/arch/c6x/include/uapi/asm/unistd.h index f3987a8703d..e7d09a614d1 100644 --- a/arch/c6x/include/uapi/asm/unistd.h +++ b/arch/c6x/include/uapi/asm/unistd.h @@ -14,7 +14,6 @@ * more details. */ -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE /* Use the standard ABI for syscalls. */ diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 0cac6a49f23..c59a01dd9c0 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -49,8 +49,6 @@ config CRIS select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32 select GENERIC_CMOS_UPDATE select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS2 config HZ diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index f27b542e0eb..5cda75a9cc1 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -371,7 +371,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index df2eb4bd9fa..9d262645f66 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -12,8 +12,6 @@ config FRV select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES select ARCH_WANT_IPC_PARSE_VERSION - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config ZONE_DMA bool diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 1807d8ea8cb..d685da17f5f 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 04bef4d25b4..98fabd10e95 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -8,8 +8,6 @@ config H8300 select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SYMBOL_PREFIX string diff --git a/arch/h8300/include/asm/unistd.h b/arch/h8300/include/asm/unistd.h index c2c2f5c7d6b..566f94860c4 100644 --- a/arch/h8300/include/asm/unistd.h +++ b/arch/h8300/include/asm/unistd.h @@ -356,7 +356,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index e418803b6c8..0744f7d7b1f 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -31,8 +31,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE ---help--- Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. diff --git a/arch/hexagon/include/uapi/asm/unistd.h b/arch/hexagon/include/uapi/asm/unistd.h index 2af81533bd0..4a87cc47075 100644 --- a/arch/hexagon/include/uapi/asm/unistd.h +++ b/arch/hexagon/include/uapi/asm/unistd.h @@ -27,7 +27,6 @@ */ #define sys_mmap2 sys_mmap_pgoff -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 67060046812..3279646120e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -42,8 +42,6 @@ config IA64 select GENERIC_TIME_VSYSCALL_OLD select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index 1574bca8613..8b3ff2f5b86 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 5183f43a2cf..f807721e19a 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -15,8 +15,6 @@ config M32R select GENERIC_ATOMIC64 select ARCH_USES_GETTIMEOFFSET select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SBUS bool diff --git a/arch/m32r/include/asm/unistd.h b/arch/m32r/include/asm/unistd.h index d9e7351af2a..cbfa39158fe 100644 --- a/arch/m32r/include/asm/unistd.h +++ b/arch/m32r/include/asm/unistd.h @@ -352,7 +352,6 @@ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 953a7ba5d05..6710084e072 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -15,8 +15,6 @@ config M68K select FPU if MMU select ARCH_WANT_IPC_PARSE_VERSION select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL select MODULES_USE_ELF_RELA diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index a021d67cdd7..847994ce680 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -31,7 +31,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 4bcf89148f3..ba3b7c8c04b 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -26,8 +26,6 @@ config MICROBLAZE select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS config SWAP diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index 94d978986b7..38cabf4db54 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -422,7 +422,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_VFORK #ifdef CONFIG_MMU diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4183e62f178..dba9390d37c 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -40,8 +40,6 @@ config MIPS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL select MODULES_USE_ELF_RELA if 64BIT - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE menu "Machine selection" diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index b306e2081ca..9e47cc11aa2 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -20,7 +20,6 @@ #define __ARCH_OMIT_COMPAT_SYS_GETDENTS64 #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 72471744a91..aa03f2e1338 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -8,8 +8,6 @@ config MN10300 select HAVE_ARCH_KGDB select HAVE_NMI_WATCHDOG if MN10300_WD_TIMER select GENERIC_CLOCKEVENTS - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_RELA config AM33_2 diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index cabf8ba73b2..e6d2ed4ba68 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -43,7 +43,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index e7f1a2993f7..05f2ba41ff1 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -22,8 +22,6 @@ config OPENRISC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config MMU def_bool y diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h index 5082b806632..ce40b71df00 100644 --- a/arch/openrisc/include/uapi/asm/unistd.h +++ b/arch/openrisc/include/uapi/asm/unistd.h @@ -20,7 +20,6 @@ #define sys_mmap2 sys_mmap_pgoff -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index e688a2be30f..b77feffbade 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -22,8 +22,6 @@ config PARISC select GENERIC_STRNCPY_FROM_USER select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS help diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 1efef41659c..3043194547c 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -163,7 +163,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 951a517a1a0..17903f1f356 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,10 +141,8 @@ config PPC select GENERIC_CLOCKEVENTS select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_KERNEL_THREAD select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS config EARLY_PRINTK diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 76fe846ec40..784872f9371 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -55,7 +55,6 @@ #define __ARCH_WANT_SYS_NEWFSTATAT #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3cbb8757704..5029ebf7110 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -137,8 +137,6 @@ config S390 select GENERIC_CLOCKEVENTS select KTIME_SCALAR if 32BIT select HAVE_ARCH_SECCOMP_FILTER - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select CLONE_BACKWARDS2 diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 086bb8eaf6a..63653087251 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -53,7 +53,6 @@ # define __ARCH_WANT_COMPAT_SYS_TIME # define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND # endif -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 45893390c7d..3b1482e7afa 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -13,8 +13,6 @@ config SCORE select GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select CLONE_BACKWARDS choice diff --git a/arch/score/include/asm/unistd.h b/arch/score/include/asm/unistd.h index 56001c93095..9cb4260a5f3 100644 --- a/arch/score/include/asm/unistd.h +++ b/arch/score/include/asm/unistd.h @@ -4,7 +4,6 @@ #define __ARCH_WANT_SYSCALL_NO_FLAGS #define __ARCH_WANT_SYSCALL_OFF_T #define __ARCH_WANT_SYSCALL_DEPRECATED -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 8451317eed5..babc2b826c5 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -40,8 +40,6 @@ config SUPERH select GENERIC_STRNLEN_USER select HAVE_MOD_ARCH_SPECIFIC if DWARF_UNWINDER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index 43d3f26b2ea..012004ed333 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -28,7 +28,6 @@ # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK # define __ARCH_WANT_SYS_RT_SIGACTION -# define __ARCH_WANT_SYS_EXECVE # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 0c7d365fa40..9f2edb5c555 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -41,8 +41,6 @@ config SPARC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index c3e5d8b6417..0ecea6ed943 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -46,7 +46,6 @@ #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif -#define __ARCH_WANT_SYS_EXECVE /* * "Conditional" syscalls diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index ea7f61e8bc9..875d008828b 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -21,8 +21,6 @@ config TILE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT diff --git a/arch/tile/include/asm/unistd.h b/arch/tile/include/asm/unistd.h index b51c6ee3cd6..940831fe9e9 100644 --- a/arch/tile/include/asm/unistd.h +++ b/arch/tile/include/asm/unistd.h @@ -16,6 +16,5 @@ #define __ARCH_WANT_SYS_LLSEEK #endif #define __ARCH_WANT_SYS_NEWFSTATAT -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index c4fbb21e802..60651df5f95 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -16,8 +16,6 @@ config UNICORE32 select ARCH_WANT_FRAME_POINTERS select GENERIC_IOMAP select MODULES_USE_ELF_REL - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h index 00cf5e286fc..d4cc4559d84 100644 --- a/arch/unicore32/include/uapi/asm/unistd.h +++ b/arch/unicore32/include/uapi/asm/unistd.h @@ -12,5 +12,4 @@ /* Use the standard ABI for syscalls. */ #include -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0df6e7d8453..01ca0ebaff0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,8 +108,6 @@ config X86 select GENERIC_STRNLEN_USER select HAVE_RCU_USER_QS if X86_64 select HAVE_IRQ_TIME_ACCOUNTING - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select MODULES_USE_ELF_REL if X86_32 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 0e7dea7d366..9dcfcc1d6f9 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -50,7 +50,6 @@ # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME # define __ARCH_WANT_SYS_WAITPID -# define __ARCH_WANT_SYS_EXECVE # define __ARCH_WANT_SYS_FORK # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 8f51c39750d..0fd20f241e4 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -13,8 +13,6 @@ endmenu config UML_X86 def_bool y select GENERIC_FIND_FIRST_BIT - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE config 64BIT bool "64-bit kernel" if SUBARCH = "x86" diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 2481f267be2..03a8c107e07 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -13,8 +13,6 @@ config XTENSA select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA select GENERIC_PCI_IOMAP - select GENERIC_KERNEL_THREAD - select GENERIC_KERNEL_EXECVE select ARCH_WANT_OPTIONAL_GPIOLIB select CLONE_BACKWARDS help diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h index e002dbcc88b..eb63ea87815 100644 --- a/arch/xtensa/include/asm/unistd.h +++ b/arch/xtensa/include/asm/unistd.h @@ -1,7 +1,6 @@ #ifndef _XTENSA_UNISTD_H #define _XTENSA_UNISTD_H -#define __ARCH_WANT_SYS_EXECVE #define __ARCH_WANT_SYS_CLONE #include diff --git a/fs/exec.c b/fs/exec.c index 721a2992951..090ac91da2e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1657,7 +1657,6 @@ int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -#ifdef __ARCH_WANT_SYS_EXECVE SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, @@ -1685,23 +1684,3 @@ asmlinkage long compat_sys_execve(const char __user * filename, return error; } #endif -#endif - -#ifdef __ARCH_WANT_KERNEL_EXECVE -int kernel_execve(const char *filename, - const char *const argv[], - const char *const envp[]) -{ - int ret = do_execve(filename, - (const char __user *const __user *)argv, - (const char __user *const __user *)envp); - if (ret < 0) - return ret; - - /* - * We were successful. We won't be returning to our caller, but - * instead to user space by manipulating the kernel stack. - */ - ret_from_kernel_execve(current_pt_regs()); -} -#endif diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 2630c9b41a8..8c1388c6ae2 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -121,8 +121,4 @@ extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); extern void free_bprm(struct linux_binprm *); -#ifdef __ARCH_WANT_KERNEL_EXECVE -extern void ret_from_kernel_execve(struct pt_regs *normal) __noreturn; -#endif - #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1162258bcaf..9e5a54e3d84 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,9 +2291,7 @@ extern int do_execve(const char *, const char __user * const __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); -#ifdef CONFIG_GENERIC_KERNEL_THREAD extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -#endif extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 91835e7f364..9fe5f946526 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -827,15 +827,6 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, const char __user *pathname); asmlinkage long sys_syncfs(int fd); -#ifndef CONFIG_GENERIC_KERNEL_EXECVE -int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]); -#else -#define kernel_execve(filename, argv, envp) \ - do_execve(filename, \ - (const char __user *const __user *)argv, \ - (const char __user *const __user *)envp) -#endif - asmlinkage long sys_fork(void); asmlinkage long sys_vfork(void); #ifdef CONFIG_CLONE_BACKWARDS diff --git a/init/main.c b/init/main.c index e33e09df3cb..155ac208d58 100644 --- a/init/main.c +++ b/init/main.c @@ -797,7 +797,9 @@ static void __init do_pre_smp_initcalls(void) static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; - return kernel_execve(init_filename, argv_init, envp_init); + return do_execve(init_filename, + (const char __user *const __user *)argv_init, + (const char __user *const __user *)envp_init); } static void __init kernel_init_freeable(void); diff --git a/kernel/fork.c b/kernel/fork.c index 54073078343..389712ffc0a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1623,7 +1623,6 @@ long do_fork(unsigned long clone_flags, return nr; } -#ifdef CONFIG_GENERIC_KERNEL_THREAD /* * Create a kernel thread. */ @@ -1632,7 +1631,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, (unsigned long)arg, NULL, NULL); } -#endif #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) diff --git a/kernel/kmod.c b/kernel/kmod.c index 1c317e38683..0023a87e8de 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -219,9 +219,9 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = kernel_execve(sub_info->path, - (const char *const *)sub_info->argv, - (const char *const *)sub_info->envp); + retval = do_execve(sub_info->path, + (const char __user *const __user *)sub_info->argv, + (const char __user *const __user *)sub_info->envp); if (!retval) return 0; -- cgit v1.2.3-70-g09d2 From 8382fcac1b813ad0a4e68a838fc7ae93fa39eda0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Dec 2012 19:26:06 -0800 Subject: pidns: Outlaw thread creation after unshare(CLONE_NEWPID) The sequence: unshare(CLONE_NEWPID) clone(CLONE_THREAD|CLONE_SIGHAND|CLONE_VM) Creates a new process in the new pid namespace without setting pid_ns->child_reaper. After forking this results in a NULL pointer dereference. Avoid this and other nonsense scenarios that can show up after creating a new pid namespace with unshare by adding a new check in copy_prodcess. Pointed-out-by: Oleg Nesterov Acked-by: Oleg Nesterov Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index a31b823b3c2..65ca6d27f24 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1166,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); + /* + * If the new process will be in a different pid namespace + * don't allow the creation of threads. + */ + if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && + (task_active_pid_ns(current) != current->nsproxy->pid_ns)) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; -- cgit v1.2.3-70-g09d2