diff options
Diffstat (limited to 'kernel/fork.c')
-rw-r--r-- | kernel/fork.c | 219 |
1 files changed, 154 insertions, 65 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index e7548dee636..e7ceaca8960 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,7 +37,6 @@ #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> -#include <linux/tracehook.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> @@ -59,7 +58,6 @@ #include <linux/taskstats_kern.h> #include <linux/random.h> #include <linux/tty.h> -#include <linux/proc_fs.h> #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> @@ -82,7 +80,7 @@ * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ -int nr_threads; /* The idle threads do not count.. */ +int nr_threads; /* The idle threads do not count.. */ int max_threads; /* tunable limit on nr_threads */ @@ -234,7 +232,7 @@ void __init fork_init(unsigned long mempages) /* * we need to allow at least 20 threads to boot a system */ - if(max_threads < 20) + if (max_threads < 20) max_threads = 20; init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; @@ -270,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - err = arch_dup_task_struct(tsk, orig); + err = arch_dup_task_struct(tsk, orig); if (err) goto out; @@ -290,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack_canary = get_random_int(); #endif - /* One for us, one for whoever does the "release_task()" (usually parent) */ - atomic_set(&tsk->usage,2); - atomic_set(&tsk->fs_excl, 0); + /* + * One for us, one for whoever does the "release_task()" (usually + * parent) + */ + atomic_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -383,15 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; - tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* @@ -441,7 +440,7 @@ fail_nomem: goto out; } -static inline int mm_alloc_pgd(struct mm_struct * mm) +static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) @@ -449,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) return 0; } -static inline void mm_free_pgd(struct mm_struct * mm) +static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } @@ -486,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm) #endif } -static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) +static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); @@ -517,16 +516,17 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) /* * Allocate and initialize an mm_struct. */ -struct mm_struct * mm_alloc(void) +struct mm_struct *mm_alloc(void) { - struct mm_struct * mm; + struct mm_struct *mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); - } - return mm; + if (!mm) + return NULL; + + memset(mm, 0, sizeof(*mm)); + mm_init_cpumask(mm); + return mm_init(mm, current); } /* @@ -573,6 +573,57 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/* + * We added or removed a vma mapping the executable. The vmas are only mapped + * during exec and are not mapped with the mmap system call. + * Callers must hold down_write() on the mm's mmap_sem for these + */ +void added_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas++; +} + +void removed_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas--; + if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { + fput(mm->exe_file); + mm->exe_file = NULL; + } + +} + +void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + if (new_exe_file) + get_file(new_exe_file); + if (mm->exe_file) + fput(mm->exe_file); + mm->exe_file = new_exe_file; + mm->num_exe_file_vmas = 0; +} + +struct file *get_mm_exe_file(struct mm_struct *mm) +{ + struct file *exe_file; + + /* We need mmap_sem to protect against races with removal of + * VM_EXECUTABLE vmas */ + down_read(&mm->mmap_sem); + exe_file = mm->exe_file; + if (exe_file) + get_file(exe_file); + up_read(&mm->mmap_sem); + return exe_file; +} + +static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + /* It's safe to write the exe_file pointer without exe_file_lock because + * this is called during fork when the task is not yet in /proc */ + newmm->exe_file = get_mm_exe_file(oldmm); +} + /** * get_task_mm - acquire a reference to the task's mm * @@ -679,6 +730,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); + mm_init_cpumask(mm); /* Initializing for Swap token stuff */ mm->token_priority = 0; @@ -726,9 +778,9 @@ fail_nocontext: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) +static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { - struct mm_struct * mm, *oldmm; + struct mm_struct *mm, *oldmm; int retval; tsk->min_flt = tsk->maj_flt = 0; @@ -795,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct * tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk) { struct files_struct *oldf, *newf; int error = 0; @@ -927,6 +979,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->threadgroup_fork_lock); +#endif + sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -958,7 +1014,7 @@ static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES - plist_head_init_raw(&p->pi_waiters, &p->pi_lock); + plist_head_init(&p->pi_waiters); p->pi_blocked_on = NULL; #endif } @@ -1103,22 +1159,27 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_lock(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); - if (IS_ERR(p->mempolicy)) { - retval = PTR_ERR(p->mempolicy); - p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; - } + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup_cgroup; + } mpol_fix_fork_child_flag(p); #endif +#ifdef CONFIG_CPUSETS + p->cpuset_mem_spread_rotor = NUMA_NO_NODE; + p->cpuset_slab_spread_rotor = NUMA_NO_NODE; +#endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW @@ -1153,30 +1214,38 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. */ - sched_fork(p, clone_flags); + sched_fork(p); retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; - - if ((retval = audit_alloc(p))) + retval = audit_alloc(p); + if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ - if ((retval = copy_semundo(clone_flags, p))) + retval = copy_semundo(clone_flags, p); + if (retval) goto bad_fork_cleanup_audit; - if ((retval = copy_files(clone_flags, p))) + retval = copy_files(clone_flags, p); + if (retval) goto bad_fork_cleanup_semundo; - if ((retval = copy_fs(clone_flags, p))) + retval = copy_fs(clone_flags, p); + if (retval) goto bad_fork_cleanup_files; - if ((retval = copy_sighand(clone_flags, p))) + retval = copy_sighand(clone_flags, p); + if (retval) goto bad_fork_cleanup_fs; - if ((retval = copy_signal(clone_flags, p))) + retval = copy_signal(clone_flags, p); + if (retval) goto bad_fork_cleanup_sighand; - if ((retval = copy_mm(clone_flags, p))) + retval = copy_mm(clone_flags, p); + if (retval) goto bad_fork_cleanup_signal; - if ((retval = copy_namespaces(clone_flags, p))) + retval = copy_namespaces(clone_flags, p); + if (retval) goto bad_fork_cleanup_mm; - if ((retval = copy_io(clone_flags, p))) + retval = copy_io(clone_flags, p); + if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); if (retval) @@ -1194,17 +1263,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; - if (current->nsproxy != p->nsproxy) { - retval = ns_cgroup_clone(p, pid); - if (retval) - goto bad_fork_free_pid; - } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1272,7 +1335,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * it's process group. * A fatal signal pending means that current will exit, so the new * thread can't slip out of an OOM kill (or normal SIGKILL). - */ + */ recalc_sigpending(); if (signal_pending(current)) { spin_unlock(¤t->sighand->siglock); @@ -1290,7 +1353,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (likely(p->pid)) { - tracehook_finish_clone(p, clone_flags, trace); + ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); if (thread_group_leader(p)) { if (is_child_reaper(pid)) @@ -1313,6 +1376,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; @@ -1351,6 +1416,8 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); @@ -1427,10 +1494,22 @@ long do_fork(unsigned long clone_flags, } /* - * When called from kernel_thread, don't do user tracing stuff. + * Determine whether and which event to report to ptracer. When + * called from kernel_thread or CLONE_UNTRACED is explicitly + * requested, no event is reported; otherwise, report if the event + * for the type of forking is enabled. */ - if (likely(user_mode(regs))) - trace = tracehook_prepare_clone(clone_flags); + if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { + if (clone_flags & CLONE_VFORK) + trace = PTRACE_EVENT_VFORK; + else if ((clone_flags & CSIGNAL) != SIGCHLD) + trace = PTRACE_EVENT_CLONE; + else + trace = PTRACE_EVENT_FORK; + + if (likely(!ptrace_event_enabled(current, trace))) + trace = 0; + } p = copy_process(clone_flags, stack_start, regs, stack_size, child_tidptr, NULL, trace); @@ -1454,26 +1533,26 @@ long do_fork(unsigned long clone_flags, } audit_finish_fork(p); - tracehook_report_clone(regs, clone_flags, nr, p); /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that - * hasn't gotten to tracehook_report_clone() yet. Now we - * clear it and set the child going. + * hasn't finished SIGSTOP raising yet. Now we clear it + * and set the child going. */ p->flags &= ~PF_STARTING; - wake_up_new_task(p, clone_flags); + wake_up_new_task(p); - tracehook_report_clone_complete(trace, regs, - clone_flags, nr, p); + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - tracehook_report_vfork_done(p, nr); + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); @@ -1508,11 +1587,19 @@ void __init proc_caches_init(void) fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + /* + * FIXME! The "sizeof(struct mm_struct)" currently includes the + * whole struct cpumask for the OFFSTACK case. We could change + * this to *only* allocate as much of it as required by the + * maximum number of CPU's we can ever have. The cpumask_allocation + * is at the end of the structure, exactly for that reason. + */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); mmap_init(); + nsproxy_cache_init(); } /* @@ -1609,12 +1696,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_fs(unshare_flags, &new_fs))) + err = unshare_fs(unshare_flags, &new_fs); + if (err) goto bad_unshare_out; - if ((err = unshare_fd(unshare_flags, &new_fd))) + err = unshare_fd(unshare_flags, &new_fd); + if (err) goto bad_unshare_cleanup_fs; - if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, - new_fs))) + err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); + if (err) goto bad_unshare_cleanup_fd; if (new_fs || new_fd || do_sysvsem || new_nsproxy) { |