diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 4 | ||||
-rw-r--r-- | kernel/audit_tree.c | 91 | ||||
-rw-r--r-- | kernel/auditfilter.c | 14 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 19 | ||||
-rw-r--r-- | kernel/cpuset.c | 12 | ||||
-rw-r--r-- | kernel/exit.c | 9 | ||||
-rw-r--r-- | kernel/fork.c | 11 | ||||
-rw-r--r-- | kernel/hrtimer.c | 9 | ||||
-rw-r--r-- | kernel/kprobes.c | 23 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 7 | ||||
-rw-r--r-- | kernel/sched.c | 18 | ||||
-rw-r--r-- | kernel/sched_debug.c | 5 | ||||
-rw-r--r-- | kernel/sched_stats.h | 15 | ||||
-rw-r--r-- | kernel/stop_machine.c | 5 |
14 files changed, 149 insertions, 93 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d8..19fad003b19 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,7 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg +CFLAGS_REMOVE_sched.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8ba0e0d934f..8b509441f49 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -24,6 +24,7 @@ struct audit_chunk { struct list_head trees; /* with root here */ int dead; int count; + atomic_long_t refs; struct rcu_head head; struct node { struct list_head list; @@ -56,7 +57,8 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch. + * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. * MSB of that sucker is stolen to mark taggings that we might have to @@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->hash); INIT_LIST_HEAD(&chunk->trees); chunk->count = count; + atomic_long_set(&chunk->refs, 1); for (i = 0; i < count; i++) { INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; @@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count) return chunk; } -static void __free_chunk(struct rcu_head *rcu) +static void free_chunk(struct audit_chunk *chunk) { - struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); int i; for (i = 0; i < chunk->count; i++) { @@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu) kfree(chunk); } -static inline void free_chunk(struct audit_chunk *chunk) +void audit_put_chunk(struct audit_chunk *chunk) { - call_rcu(&chunk->head, __free_chunk); + if (atomic_long_dec_and_test(&chunk->refs)) + free_chunk(chunk); } -void audit_put_chunk(struct audit_chunk *chunk) +static void __put_chunk(struct rcu_head *rcu) { - put_inotify_watch(&chunk->watch); + struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); + audit_put_chunk(chunk); } enum {HASH_SIZE = 128}; @@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { if (p->watch.inode == inode) { - get_inotify_watch(&p->watch); + atomic_long_inc(&p->refs); return p; } } @@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) /* tagging and untagging inodes with trees */ -static void untag_chunk(struct audit_chunk *chunk, struct node *p) +static struct audit_chunk *find_chunk(struct node *p) +{ + int index = p->index & ~(1U<<31); + p -= index; + return container_of(p, struct audit_chunk, owners[0]); +} + +static void untag_chunk(struct node *p) { + struct audit_chunk *chunk = find_chunk(p); struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; + if (!pin_inotify_watch(&chunk->watch)) { + /* + * Filesystem is shutting down; all watches are getting + * evicted, just take it off the node list for this + * tree and let the eviction logics take care of the + * rest. + */ + owner = p->owner; + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + list_del_init(&p->list); + p->owner = NULL; + put_tree(owner); + return; + } + + spin_unlock(&hash_lock); + + /* + * pin_inotify_watch() succeeded, so the watch won't go away + * from under us. + */ mutex_lock(&chunk->watch.inode->inotify_mutex); if (chunk->dead) { mutex_unlock(&chunk->watch.inode->inotify_mutex); - return; + goto out; } owner = p->owner; @@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; } new = alloc_chunk(size); @@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; Fallback: // do the best we can @@ -277,6 +313,9 @@ Fallback: put_tree(owner); spin_unlock(&hash_lock); mutex_unlock(&chunk->watch.inode->inotify_mutex); +out: + unpin_inotify_watch(&chunk->watch); + spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) @@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static struct audit_chunk *find_chunk(struct node *p) -{ - int index = p->index & ~(1U<<31); - p -= index; - return container_of(p, struct audit_chunk, owners[0]); -} - static void kill_rules(struct audit_tree *tree) { struct audit_krule *rule, *next; @@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim) spin_lock(&hash_lock); while (!list_empty(&victim->chunks)) { struct node *p; - struct audit_chunk *chunk; p = list_entry(victim->chunks.next, struct node, list); - chunk = find_chunk(p); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, p); - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(p); } spin_unlock(&hash_lock); put_tree(victim); @@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree) while (!list_empty(&tree->chunks)) { struct node *node; - struct audit_chunk *chunk; node = list_entry(tree->chunks.next, struct node, list); @@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree) if (!(node->index & (1U<<31))) break; - chunk = find_chunk(node); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, node); - - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(node); } if (!tree->root && !tree->goner) { tree->goner = 1; @@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, static void destroy_watch(struct inotify_watch *watch) { struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - free_chunk(chunk); + call_rcu(&chunk->head, __put_chunk); } static const struct inotify_operations rtree_inotify_ops = { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b7d354e2b0e..9fd85a4640a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list) list_for_each_entry_safe(p, n, in_list, ilist) { list_del(&p->ilist); inotify_rm_watch(audit_ih, &p->wdata); - /* the put matching the get in audit_do_del_rule() */ - put_inotify_watch(&p->wdata); + /* the unpin matching the pin in audit_do_del_rule() */ + unpin_inotify_watch(&p->wdata); } } @@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry, /* Put parent on the inotify un-registration * list. Grab a reference before releasing * audit_filter_mutex, to be released in - * audit_inotify_unregister(). */ - list_add(&parent->ilist, &inotify_list); - get_inotify_watch(&parent->wdata); + * audit_inotify_unregister(). + * If filesystem is going away, just leave + * the sucker alone, eviction will take + * care of it. + */ + if (pin_inotify_watch(&parent->wdata)) + list_add(&parent->ilist, &inotify_list); } } } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7fa476f01d0..fb249e2bcad 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -184,9 +184,20 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; - task_lock(task); + /* + * No lock is needed, since the task isn't on tasklist yet, + * so it can't be moved to another cgroup, which means the + * freezer won't be removed and will be valid during this + * function call. + */ freezer = task_freezer(task); - task_unlock(task); + + /* + * The root cgroup is non-freezable, so we can skip the + * following check. + */ + if (!freezer->css.cgroup->parent) + return; spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); @@ -331,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup, else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) goal_state = CGROUP_FROZEN; else - return -EIO; + return -EINVAL; if (!cgroup_lock_live_group(cgroup)) return -ENODEV; @@ -350,6 +361,8 @@ static struct cftype files[] = { static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) { + if (!cgroup->parent) + return 0; return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e00526f52e..81fc6791a29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -587,7 +587,6 @@ static int generate_sched_domains(cpumask_t **domains, int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - ndoms = 0; doms = NULL; dattr = NULL; csa = NULL; @@ -674,10 +673,8 @@ restart: * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) { - ndoms = 0; + if (!doms) goto done; - } /* * The rest of the code, including the scheduler, can deal with @@ -732,6 +729,13 @@ restart: done: kfree(csa); + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + *domains = doms; *attributes = dattr; return ndoms; diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fa..2d8be7ebb0f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -40,7 +40,6 @@ #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> -#include <linux/compat.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> @@ -1059,14 +1058,6 @@ NORET_TYPE void do_exit(long code) exit_itimers(tsk->signal); } acct_collect(code, group_dead); -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -#endif -#endif if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) diff --git a/kernel/fork.c b/kernel/fork.c index f6083561dfe..2a372a0e206 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include <linux/jiffies.h> #include <linux/tracehook.h> #include <linux/futex.h> +#include <linux/compat.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> @@ -519,6 +520,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { struct completion *vfork_done = tsk->vfork_done; + /* Get rid of any futexes when releasing the mm */ +#ifdef CONFIG_FUTEX + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); +#endif +#endif + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 95d3949f2ae..47e63349d1b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, /* Timer is expired, act upon the callback mode */ switch(timer->cb_mode) { - case HRTIMER_CB_IRQSAFE_NO_RESTART: - debug_hrtimer_deactivate(timer); - /* - * We can call the callback from here. No restart - * happens, so no danger of recursion - */ - BUG_ON(timer->function(timer) != HRTIMER_NORESTART); - return 1; case HRTIMER_CB_IRQSAFE_PERCPU: case HRTIMER_CB_IRQSAFE_UNLOCKED: /* @@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ debug_hrtimer_deactivate(timer); return 1; - case HRTIMER_CB_IRQSAFE: case HRTIMER_CB_SOFTIRQ: /* * Move everything else into the softirq pending list ! diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8b57a2597f2..9f8a3f25259 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,7 +72,7 @@ static bool kprobe_enabled; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned; + spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) @@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = addr; - if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) + preempt_disable(); + if (!__kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 0; /* * Check if are we probing a module. */ - probed_mod = module_text_address((unsigned long) p->addr); + probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod = module_text_address(called_from); + struct module *calling_mod; + calling_mod = __module_text_address(called_from); /* * We must allow modules to probe themself and in this case * avoid incrementing the module refcount, so as to allow * unloading of self probing modules. */ if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 1; } else probed_mod = NULL; } + preempt_enable(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) struct kprobe *old_p; if (p->mod_refcounted) { + /* + * Since we've already incremented refcount, + * we don't need to disable preemption. + */ mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 153dcb2639c..895337b16a2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample, */ static inline int fastpath_timer_check(struct task_struct *tsk) { - struct signal_struct *sig = tsk->signal; + struct signal_struct *sig; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) return 0; if (!task_cputime_zero(&tsk->cputime_expires)) { @@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; } + + sig = tsk->signal; if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; diff --git a/kernel/sched.c b/kernel/sched.c index 50a21f96467..9b1e79371c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1456,6 +1456,8 @@ static unsigned long cpu_avg_load_per_task(int cpu) if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; + else + rq->avg_load_per_task = 0; return rq->avg_load_per_task; } @@ -5868,6 +5870,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + spin_lock_irqsave(&rq->lock, flags); + __sched_fork(idle); idle->se.exec_start = sched_clock(); @@ -5875,7 +5879,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->cpus_allowed = cpumask_of_cpu(cpu); __set_task_cpu(idle, cpu); - spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; @@ -7786,13 +7789,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * The passed in 'doms_new' should be kmalloc'd. This routine takes * ownership of it and will kfree it when done with it. If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. + * failed the kmalloc call, then it can pass in doms_new == NULL && + * ndoms_new == 1, and partition_sched_domains() will fallback to + * the single partition 'fallback_doms', it also forces the domains + * to be rebuilt. * - * If doms_new==NULL it will be replaced with cpu_online_map. - * ndoms_new==0 is a special case for destroying existing domains. - * It will not create the default domain. + * If doms_new == NULL it will be replaced with cpu_online_map. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. * * Call with hotplug lock held */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 48ecc51e770..26ed8e3d1c1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -423,10 +423,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #undef __P { + unsigned int this_cpu = raw_smp_processor_id(); u64 t0, t1; - t0 = sched_clock(); - t1 = sched_clock(); + t0 = cpu_clock(this_cpu); + t1 = cpu_clock(this_cpu); SEQ_printf(m, "%-35s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index ee71bec1da6..7dbf72a2b02 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -298,9 +298,11 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct signal_struct *sig; - sig = tsk->signal; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal */ + if (unlikely(tsk->exit_state)) return; + + sig = tsk->signal; if (sig->cputime.totals) { struct task_cputime *times; @@ -325,9 +327,11 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct signal_struct *sig; - sig = tsk->signal; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal */ + if (unlikely(tsk->exit_state)) return; + + sig = tsk->signal; if (sig->cputime.totals) { struct task_cputime *times; @@ -353,8 +357,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, struct signal_struct *sig; sig = tsk->signal; + /* see __exit_signal()->task_rq_unlock_wait() */ + barrier(); if (unlikely(!sig)) return; + if (sig->cputime.totals) { struct task_cputime *times; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 9bc4c00872c..24e8ceacc38 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -112,7 +112,7 @@ static int chill(void *unused) int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { struct work_struct *sm_work; - int i; + int i, ret; /* Set up initial state. */ mutex_lock(&lock); @@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) /* This will release the thread on our CPU. */ put_cpu(); flush_workqueue(stop_machine_wq); + ret = active.fnret; mutex_unlock(&lock); - return active.fnret; + return ret; } int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) |