diff options
Diffstat (limited to 'kernel')
41 files changed, 2107 insertions, 1191 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f1849..156cc555614 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { - struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -198,7 +197,6 @@ struct take_cpu_down_param { static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; - unsigned int cpu = (unsigned long)param->hcpu; int err; /* Ensure this CPU doesn't handle any more interrupts. */ @@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param) cpu_notify(CPU_DYING | param->mod, param->hcpu); - if (task_cpu(param->caller) == cpu) - move_task_off_dead_cpu(cpu, param->caller); - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. */ - sched_idle_next(); return 0; } @@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { - .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } BUG_ON(cpu_online(cpu)); - /* Wait for it to sleep (leaving idle task). */ + /* + * The migration_call() CPU_DYING callback will have removed all + * runnable tasks from the cpu, there's only the idle task left now + * that the migration thread is done doing the stop_machine thing. + * + * Wait for the stop thread to go away. + */ while (!idle_cpu(cpu)) - yield(); + cpu_relax(); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -386,6 +384,14 @@ out: #ifdef CONFIG_PM_SLEEP_SMP static cpumask_var_t frozen_cpus; +void __weak arch_disable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_disable_nonboot_cpus_end(void) +{ +} + int disable_nonboot_cpus(void) { int cpu, first_cpu, error = 0; @@ -397,6 +403,7 @@ int disable_nonboot_cpus(void) * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); + arch_disable_nonboot_cpus_begin(); printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { @@ -412,6 +419,8 @@ int disable_nonboot_cpus(void) } } + arch_disable_nonboot_cpus_end(); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ diff --git a/kernel/fork.c b/kernel/fork.c index 5447dc7defa..7d164e25b0f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (atomic_dec_and_test(&sig->sigcnt)) { + sched_autogroup_exit(sig); free_signal_struct(sig); + } } void __put_task_struct(struct task_struct *tsk) @@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) posix_cpu_timers_init_group(sig); tty_audit_fork(sig); + sched_autogroup_fork(sig); sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; @@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm: } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - free_signal_struct(p->signal); + put_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: diff --git a/kernel/futex.c b/kernel/futex.c index 40a8777a27d..5696d38cc71 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -69,6 +69,14 @@ int __read_mostly futex_cmpxchg_enabled; #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* + * Futex flags used to encode options to functions and preserve them across + * restarts. + */ +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +/* * Priority Inheritance state: */ struct futex_pi_state { @@ -123,6 +131,12 @@ struct futex_q { u32 bitset; }; +static const struct futex_q futex_q_init = { + /* list gets initialized in queue_me()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY +}; + /* * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task @@ -283,8 +297,7 @@ again: return 0; } -static inline -void put_futex_key(int fshared, union futex_key *key) +static inline void put_futex_key(union futex_key *key) { drop_futex_key_refs(key); } @@ -778,10 +791,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* - * This happens when we have stolen the lock and the original - * pending owner did not enqueue itself back on the rt_mutex. - * Thats not a tragedy. We know that way, that a lock waiter - * is on the fly. We make the futex_q waiter the pending owner. + * It is possible that the next waiter (the one that brought + * this owner to the kernel) timed out and is no longer + * waiting on the lock. */ if (!new_owner) new_owner = this->task; @@ -870,7 +882,8 @@ double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) /* * Wake up waiters matching bitset queued on this futex (uaddr). */ -static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) +static int +futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -881,7 +894,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -907,7 +920,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) } spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; } @@ -917,7 +930,7 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -927,10 +940,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int ret, op_ret; retry: - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -962,11 +975,11 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } @@ -996,9 +1009,9 @@ retry_private: double_unlock_hb(hb1, hb2); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: return ret; } @@ -1133,13 +1146,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address - * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @flags: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) * @requeue_pi: if we are attempting to requeue from a non-pi futex to a - * pi futex (pi to pi requeue is not supported) + * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. @@ -1148,9 +1161,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * >=0 - on success, the number of tasks requeued or woken * <0 - on error */ -static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval, - int requeue_pi) +static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + u32 __user *uaddr2, int nr_wake, int nr_requeue, + u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; int drop_count = 0, task_count = 0, ret; @@ -1191,10 +1204,10 @@ retry: pi_state = NULL; } - ret = get_futex_key(uaddr1, fshared, &key1); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out_put_key1; @@ -1216,11 +1229,11 @@ retry_private: if (ret) goto out_put_keys; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); goto retry; } if (curval != *cmpval) { @@ -1260,8 +1273,8 @@ retry_private: break; case -EFAULT: double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); ret = fault_in_user_writeable(uaddr2); if (!ret) goto retry; @@ -1269,8 +1282,8 @@ retry_private: case -EAGAIN: /* The owner was exiting, try again. */ double_unlock_hb(hb1, hb2); - put_futex_key(fshared, &key2); - put_futex_key(fshared, &key1); + put_futex_key(&key2); + put_futex_key(&key1); cond_resched(); goto retry; default: @@ -1352,9 +1365,9 @@ out_unlock: drop_futex_key_refs(&key1); out_put_keys: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out_put_key1: - put_futex_key(fshared, &key1); + put_futex_key(&key1); out: if (pi_state != NULL) free_pi_state(pi_state); @@ -1494,7 +1507,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, int fshared) + struct task_struct *newowner) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1587,20 +1600,11 @@ handle_fault: goto retry; } -/* - * In case we must use restart_block to restart a futex_wait, - * we encode in the 'flags' shared capability - */ -#define FLAGS_SHARED 0x01 -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 - static long futex_wait_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management * @uaddr: user address of the futex - * @fshared: whether the futex is shared (1) or not (0) * @q: futex_q (contains pi_state and access to the rt_mutex) * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) * @@ -1613,8 +1617,7 @@ static long futex_wait_restart(struct restart_block *restart); * 0 - success, lock not taken * <0 - on error (-EFAULT) */ -static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, - int locked) +static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { struct task_struct *owner; int ret = 0; @@ -1625,7 +1628,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * did a lock-steal - fix up the PI-state in that case: */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current, fshared); + ret = fixup_pi_state_owner(uaddr, q, current); goto out; } @@ -1652,7 +1655,7 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, * lock. Fix the state up. */ owner = rt_mutex_owner(&q->pi_state->pi_mutex); - ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -1715,7 +1718,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value - * @fshared: whether the futex is shared (1) or not (0) + * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @hb: storage for hash_bucket pointer to be returned to caller * @@ -1728,7 +1731,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * 0 - uaddr contains val and hb has been locked * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked */ -static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; @@ -1752,8 +1755,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, * rare, but normal. */ retry: - q->key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q->key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); if (unlikely(ret != 0)) return ret; @@ -1769,10 +1771,10 @@ retry_private: if (ret) goto out; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); goto retry; } @@ -1783,32 +1785,29 @@ retry_private: out: if (ret) - put_futex_key(fshared, &q->key); + put_futex_key(&q->key); return ret; } -static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset, int clockrt) +static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to = NULL; struct restart_block *restart; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; - - q.pi_state = NULL; q.bitset = bitset; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -1819,7 +1818,7 @@ retry: * Prepare to wait on uaddr. On success, holds hb lock and increments * q.key refs. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out; @@ -1852,12 +1851,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = FLAGS_HAS_TIMEOUT; - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; + restart->futex.flags = flags; ret = -ERESTART_RESTARTBLOCK; @@ -1873,7 +1867,6 @@ out: static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; - int fshared = 0; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { @@ -1881,11 +1874,9 @@ static long futex_wait_restart(struct restart_block *restart) tp = &t; } restart->fn = do_no_restart_syscall; - if (restart->futex.flags & FLAGS_SHARED) - fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, - restart->futex.bitset, - restart->futex.flags & FLAGS_CLOCKRT); + + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); } @@ -1895,12 +1886,12 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, int fshared, - int detect, ktime_t *time, int trylock) +static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, + ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; struct futex_hash_bucket *hb; - struct futex_q q; + struct futex_q q = futex_q_init; int res, ret; if (refill_pi_state_cache()) @@ -1914,12 +1905,8 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, hrtimer_set_expires(&to->timer, *time); } - q.pi_state = NULL; - q.rt_waiter = NULL; - q.requeue_pi_key = NULL; retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); if (unlikely(ret != 0)) goto out; @@ -1941,7 +1928,7 @@ retry_private: * exit to complete. */ queue_unlock(&q, hb); - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); cond_resched(); goto retry; default: @@ -1971,7 +1958,7 @@ retry_private: * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr, fshared, &q, !ret); + res = fixup_owner(uaddr, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it acquired * the lock, clear our -ETIMEDOUT or -EINTR. @@ -1995,7 +1982,7 @@ out_unlock_put_key: queue_unlock(&q, hb); out_put_key: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out: if (to) destroy_hrtimer_on_stack(&to->timer); @@ -2008,10 +1995,10 @@ uaddr_faulted: if (ret) goto out_put_key; - if (!fshared) + if (!(flags & FLAGS_SHARED)) goto retry_private; - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); goto retry; } @@ -2020,7 +2007,7 @@ uaddr_faulted: * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. */ -static int futex_unlock_pi(u32 __user *uaddr, int fshared) +static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -2038,7 +2025,7 @@ retry: if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - ret = get_futex_key(uaddr, fshared, &key); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); if (unlikely(ret != 0)) goto out; @@ -2093,14 +2080,14 @@ retry: out_unlock: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); out: return ret; pi_faulted: spin_unlock(&hb->lock); - put_futex_key(fshared, &key); + put_futex_key(&key); ret = fault_in_user_writeable(uaddr); if (!ret) @@ -2160,7 +2147,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, /** * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 * @uaddr: the futex we initially wait on (non-pi) - * @fshared: whether the futexes are shared (1) or not (0). They must be + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be * the same type, no requeueing from private to shared, etc. * @val: the expected value of uaddr * @abs_time: absolute timeout @@ -2198,16 +2185,16 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * 0 - On success * <0 - On error */ -static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, +static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, - int clockrt, u32 __user *uaddr2) + u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; - union futex_key key2; - struct futex_q q; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; int res, ret; if (!bitset) @@ -2215,8 +2202,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, if (abs_time) { to = &timeout; - hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); hrtimer_set_expires_range_ns(&to->timer, *abs_time, current->timer_slack_ns); @@ -2229,12 +2217,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; - key2 = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr2, fshared, &key2); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); if (unlikely(ret != 0)) goto out; - q.pi_state = NULL; q.bitset = bitset; q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; @@ -2243,7 +2229,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Prepare to wait on uaddr. On success, increments q.key (key1) ref * count. */ - ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) goto out_key2; @@ -2273,8 +2259,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, */ if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); - ret = fixup_pi_state_owner(uaddr2, &q, current, - fshared); + ret = fixup_pi_state_owner(uaddr2, &q, current); spin_unlock(q.lock_ptr); } } else { @@ -2293,7 +2278,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. */ - res = fixup_owner(uaddr2, fshared, &q, !ret); + res = fixup_owner(uaddr2, &q, !ret); /* * If fixup_owner() returned an error, proprogate that. If it * acquired the lock, clear -ETIMEDOUT or -EINTR. @@ -2324,9 +2309,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, } out_put_keys: - put_futex_key(fshared, &q.key); + put_futex_key(&q.key); out_key2: - put_futex_key(fshared, &key2); + put_futex_key(&key2); out: if (to) { @@ -2551,58 +2536,57 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int clockrt, ret = -ENOSYS; - int cmd = op & FUTEX_CMD_MASK; - int fshared = 0; + int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = 1; + flags |= FLAGS_SHARED; - clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) - return -ENOSYS; + if (op & FUTEX_CLOCK_REALTIME) { + flags |= FLAGS_CLOCKRT; + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); + ret = futex_wait(uaddr, flags, val, timeout, val3); break; case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, fshared, val, val3); + ret = futex_wake(uaddr, flags, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 0); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); break; case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); + ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + ret = futex_lock_pi(uaddr, flags, val, timeout, 0); break; case FUTEX_UNLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, fshared); + ret = futex_unlock_pi(uaddr, flags); break; case FUTEX_TRYLOCK_PI: if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); break; case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; - ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, - clockrt, uaddr2); + ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); break; case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, - 1); + ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); break; default: ret = -ENOSYS; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6c..f2429fc3438 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -516,10 +516,13 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); /* * clock_was_set() has changed base->offset so the @@ -840,48 +843,17 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - struct rb_node **link = &base->active.rb_node; - struct rb_node *parent = NULL; - struct hrtimer *entry; - int leftmost = 1; - debug_activate(timer); - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct hrtimer, node); - /* - * We dont care about collisions. Nodes with - * the same expiry time stay together. - */ - if (hrtimer_get_expires_tv64(timer) < - hrtimer_get_expires_tv64(entry)) { - link = &(*link)->rb_left; - } else { - link = &(*link)->rb_right; - leftmost = 0; - } - } - - /* - * Insert the timer to the rbtree and check whether it - * replaces the first pending timer - */ - if (leftmost) - base->first = &timer->node; + timerqueue_add(&base->active, &timer->node); - rb_link_node(&timer->node, parent, link); - rb_insert_color(&timer->node, &base->active); /* * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the * state of a possibly running callback. */ timer->state |= HRTIMER_STATE_ENQUEUED; - return leftmost; + return (&timer->node == base->active.next); } /* @@ -901,12 +873,7 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!(timer->state & HRTIMER_STATE_ENQUEUED)) goto out; - /* - * Remove the timer from the rbtree and replace the first - * entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); + if (&timer->node == timerqueue_getnext(&base->active)) { #ifdef CONFIG_HIGH_RES_TIMERS /* Reprogram the clock event device. if enabled */ if (reprogram && hrtimer_hres_active()) { @@ -919,7 +886,7 @@ static void __remove_hrtimer(struct hrtimer *timer, } #endif } - rb_erase(&timer->node, &base->active); + timerqueue_del(&base->active, &timer->node); out: timer->state = newstate; } @@ -1128,11 +1095,13 @@ ktime_t hrtimer_get_next_event(void) if (!hrtimer_hres_active()) { for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; + struct timerqueue_node *next; - if (!base->first) + next = timerqueue_getnext(&base->active); + if (!next) continue; - timer = rb_entry(base->first, struct hrtimer, node); + timer = container_of(next, struct hrtimer, node); delta.tv64 = hrtimer_get_expires_tv64(timer); delta = ktime_sub(delta, base->get_time()); if (delta.tv64 < mindelta.tv64) @@ -1162,6 +1131,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->base = &cpu_base->clock_base[clock_id]; hrtimer_init_timer_hres(timer); + timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -1278,14 +1248,14 @@ retry: for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; - struct rb_node *node; + struct timerqueue_node *node; basenow = ktime_add(now, base->offset); - while ((node = base->first)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is @@ -1441,7 +1411,7 @@ void hrtimer_run_pending(void) */ void hrtimer_run_queues(void) { - struct rb_node *node; + struct timerqueue_node *node; struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); struct hrtimer_clock_base *base; int index, gettime = 1; @@ -1451,8 +1421,7 @@ void hrtimer_run_queues(void) for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { base = &cpu_base->clock_base[index]; - - if (!base->first) + if (!timerqueue_getnext(&base->active)) continue; if (gettime) { @@ -1462,10 +1431,10 @@ void hrtimer_run_queues(void) raw_spin_lock(&cpu_base->lock); - while ((node = base->first)) { + while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; - timer = rb_entry(node, struct hrtimer, node); + timer = container_of(node, struct hrtimer, node); if (base->softirq_time.tv64 <= hrtimer_get_expires_tv64(timer)) break; @@ -1630,8 +1599,10 @@ static void __cpuinit init_hrtimers_cpu(int cpu) raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } hrtimer_init_hres(cpu_base); } @@ -1642,10 +1613,10 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; - struct rb_node *node; + struct timerqueue_node *node; - while ((node = rb_first(&old_base->active))) { - timer = rb_entry(node, struct hrtimer, node); + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index e5325825aeb..086adf25a55 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -641,7 +641,7 @@ int __init init_hw_breakpoint(void) constraints_initialized = 1; - perf_pmu_register(&perf_breakpoint); + perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5f92acc5f95..91a5fa25054 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } */ static int irq_thread(void *data) { - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + static struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); int wake, oneshot = desc->status & IRQ_ONESHOT; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9737a76e106..7663e5df0e6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -354,13 +354,20 @@ static inline int kprobe_aggrprobe(struct kprobe *p) return p->pre_handler == aggr_pre_handler; } +/* Return true(!0) if the kprobe is unused */ +static inline int kprobe_unused(struct kprobe *p) +{ + return kprobe_aggrprobe(p) && kprobe_disabled(p) && + list_empty(&p->list); +} + /* * Keep all fields in the kprobe consistent */ -static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) { - memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); + memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); } #ifdef CONFIG_OPTPROBES @@ -384,6 +391,17 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) } } +/* Free optimized instructions and optimized_kprobe */ +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_remove_optimized_kprobe(op); + arch_remove_kprobe(p); + kfree(op); +} + /* Return true(!0) if the kprobe is ready for optimization. */ static inline int kprobe_optready(struct kprobe *p) { @@ -397,6 +415,33 @@ static inline int kprobe_optready(struct kprobe *p) return 0; } +/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ +static inline int kprobe_disarmed(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ + if (!kprobe_aggrprobe(p)) + return kprobe_disabled(p); + + op = container_of(p, struct optimized_kprobe, kp); + + return kprobe_disabled(p) && list_empty(&op->list); +} + +/* Return true(!0) if the probe is queued on (un)optimizing lists */ +static int __kprobes kprobe_queued(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + return 1; + } + return 0; +} + /* * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). @@ -422,30 +467,23 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); +static LIST_HEAD(unoptimizing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); +static DECLARE_COMPLETION(optimizer_comp); #define OPTIMIZE_DELAY 5 -/* Kprobe jump optimizer */ -static __kprobes void kprobe_optimizer(struct work_struct *work) +/* + * Optimize (replace a breakpoint with a jump) kprobes listed on + * optimizing_list. + */ +static __kprobes void do_optimize_kprobes(void) { - struct optimized_kprobe *op, *tmp; - - /* Lock modules while optimizing kprobes */ - mutex_lock(&module_mutex); - mutex_lock(&kprobe_mutex); - if (kprobes_all_disarmed || !kprobes_allow_optimization) - goto end; - - /* - * Wait for quiesence period to ensure all running interrupts - * are done. Because optprobe may modify multiple instructions - * there is a chance that Nth instruction is interrupted. In that - * case, running interrupt can return to 2nd-Nth byte of jump - * instruction. This wait is for avoiding it. - */ - synchronize_sched(); + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; /* * The optimization/unoptimization refers online_cpus via @@ -459,17 +497,111 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) */ get_online_cpus(); mutex_lock(&text_mutex); - list_for_each_entry_safe(op, tmp, &optimizing_list, list) { - WARN_ON(kprobe_disabled(&op->kp)); - if (arch_optimize_kprobe(op) < 0) - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - list_del_init(&op->list); + arch_optimize_kprobes(&optimizing_list); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +/* + * Unoptimize (replace a jump with a breakpoint and remove the breakpoint + * if need) kprobes listed on unoptimizing_list. + */ +static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +{ + struct optimized_kprobe *op, *tmp; + + /* Unoptimization must be done anytime */ + if (list_empty(&unoptimizing_list)) + return; + + /* Ditto to do_optimize_kprobes */ + get_online_cpus(); + mutex_lock(&text_mutex); + arch_unoptimize_kprobes(&unoptimizing_list, free_list); + /* Loop free_list for disarming */ + list_for_each_entry_safe(op, tmp, free_list, list) { + /* Disarm probes if marked disabled */ + if (kprobe_disabled(&op->kp)) + arch_disarm_kprobe(&op->kp); + if (kprobe_unused(&op->kp)) { + /* + * Remove unused probes from hash list. After waiting + * for synchronization, these probes are reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes.) + */ + hlist_del_rcu(&op->kp.hlist); + } else + list_del_init(&op->list); } mutex_unlock(&text_mutex); put_online_cpus(); -end: +} + +/* Reclaim all kprobes on the free_list */ +static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +{ + struct optimized_kprobe *op, *tmp; + + list_for_each_entry_safe(op, tmp, free_list, list) { + BUG_ON(!kprobe_unused(&op->kp)); + list_del_init(&op->list); + free_aggr_kprobe(&op->kp); + } +} + +/* Start optimizer after OPTIMIZE_DELAY passed */ +static __kprobes void kick_kprobe_optimizer(void) +{ + if (!delayed_work_pending(&optimizing_work)) + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +} + +/* Kprobe jump optimizer */ +static __kprobes void kprobe_optimizer(struct work_struct *work) +{ + LIST_HEAD(free_list); + + /* Lock modules while optimizing kprobes */ + mutex_lock(&module_mutex); + mutex_lock(&kprobe_mutex); + + /* + * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) + * kprobes before waiting for quiesence period. + */ + do_unoptimize_kprobes(&free_list); + + /* + * Step 2: Wait for quiesence period to ensure all running interrupts + * are done. Because optprobe may modify multiple instructions + * there is a chance that Nth instruction is interrupted. In that + * case, running interrupt can return to 2nd-Nth byte of jump + * instruction. This wait is for avoiding it. + */ + synchronize_sched(); + + /* Step 3: Optimize kprobes after quiesence period */ + do_optimize_kprobes(); + + /* Step 4: Free cleaned kprobes after quiesence period */ + do_free_cleaned_kprobes(&free_list); + mutex_unlock(&kprobe_mutex); mutex_unlock(&module_mutex); + + /* Step 5: Kick optimizer again if needed */ + if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) + kick_kprobe_optimizer(); + else + /* Wake up all waiters */ + complete_all(&optimizer_comp); +} + +/* Wait for completing optimization and unoptimization */ +static __kprobes void wait_for_kprobe_optimizer(void) +{ + if (delayed_work_pending(&optimizing_work)) + wait_for_completion(&optimizer_comp); } /* Optimize kprobe if p is ready to be optimized */ @@ -495,42 +627,99 @@ static __kprobes void optimize_kprobe(struct kprobe *p) /* Check if it is already optimized. */ if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) return; - op->kp.flags |= KPROBE_FLAG_OPTIMIZED; - list_add(&op->list, &optimizing_list); - if (!delayed_work_pending(&optimizing_work)) - schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); + + if (!list_empty(&op->list)) + /* This is under unoptimizing. Just dequeue the probe */ + list_del_init(&op->list); + else { + list_add(&op->list, &optimizing_list); + kick_kprobe_optimizer(); + } +} + +/* Short cut to direct unoptimizing */ +static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) +{ + get_online_cpus(); + arch_unoptimize_kprobe(op); + put_online_cpus(); + if (kprobe_disabled(&op->kp)) + arch_disarm_kprobe(&op->kp); } /* Unoptimize a kprobe if p is optimized */ -static __kprobes void unoptimize_kprobe(struct kprobe *p) +static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) { struct optimized_kprobe *op; - if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { - op = container_of(p, struct optimized_kprobe, kp); - if (!list_empty(&op->list)) - /* Dequeue from the optimization queue */ + if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) + return; /* This is not an optprobe nor optimized */ + + op = container_of(p, struct optimized_kprobe, kp); + if (!kprobe_optimized(p)) { + /* Unoptimized or unoptimizing case */ + if (force && !list_empty(&op->list)) { + /* + * Only if this is unoptimizing kprobe and forced, + * forcibly unoptimize it. (No need to unoptimize + * unoptimized kprobe again :) + */ list_del_init(&op->list); - else - /* Replace jump with break */ - arch_unoptimize_kprobe(op); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + force_unoptimize_kprobe(op); + } + return; + } + + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + if (!list_empty(&op->list)) { + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + return; + } + /* Optimized kprobe case */ + if (force) + /* Forcibly update the code: this is a special case */ + force_unoptimize_kprobe(op); + else { + list_add(&op->list, &unoptimizing_list); + kick_kprobe_optimizer(); } } +/* Cancel unoptimizing for reusing */ +static void reuse_unused_kprobe(struct kprobe *ap) +{ + struct optimized_kprobe *op; + + BUG_ON(!kprobe_unused(ap)); + /* + * Unused kprobe MUST be on the way of delayed unoptimizing (means + * there is still a relative jump) and disabled. + */ + op = container_of(ap, struct optimized_kprobe, kp); + if (unlikely(list_empty(&op->list))) + printk(KERN_WARNING "Warning: found a stray unused " + "aggrprobe@%p\n", ap->addr); + /* Enable the probe again */ + ap->flags &= ~KPROBE_FLAG_DISABLED; + /* Optimize it again (remove from op->list) */ + BUG_ON(!kprobe_optready(ap)); + optimize_kprobe(ap); +} + /* Remove optimized instructions */ static void __kprobes kill_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - if (!list_empty(&op->list)) { - /* Dequeue from the optimization queue */ + if (!list_empty(&op->list)) + /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; - } - /* Don't unoptimize, because the target code will be freed. */ + + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } @@ -543,16 +732,6 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p) arch_prepare_optimized_kprobe(op); } -/* Free optimized instructions and optimized_kprobe */ -static __kprobes void free_aggr_kprobe(struct kprobe *p) -{ - struct optimized_kprobe *op; - - op = container_of(p, struct optimized_kprobe, kp); - arch_remove_optimized_kprobe(op); - kfree(op); -} - /* Allocate new optimized_kprobe and try to prepare optimized instructions */ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { @@ -587,7 +766,8 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe */ - free_aggr_kprobe(ap); + arch_remove_optimized_kprobe(op); + kfree(op); return; } @@ -631,21 +811,16 @@ static void __kprobes unoptimize_all_kprobes(void) return; kprobes_allow_optimization = false; - printk(KERN_INFO "Kprobes globally unoptimized\n"); - get_online_cpus(); /* For avoiding text_mutex deadlock */ - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!kprobe_disabled(p)) - unoptimize_kprobe(p); + unoptimize_kprobe(p, false); } } - - mutex_unlock(&text_mutex); - put_online_cpus(); - /* Allow all currently running kprobes to complete */ - synchronize_sched(); + /* Wait for unoptimizing completion */ + wait_for_kprobe_optimizer(); + printk(KERN_INFO "Kprobes globally unoptimized\n"); } int sysctl_kprobes_optimization; @@ -669,44 +844,60 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, } #endif /* CONFIG_SYSCTL */ +/* Put a breakpoint for a probe. Must be called with text_mutex locked */ static void __kprobes __arm_kprobe(struct kprobe *p) { - struct kprobe *old_p; + struct kprobe *_p; /* Check collision with other optimized kprobes */ - old_p = get_optimized_kprobe((unsigned long)p->addr); - if (unlikely(old_p)) - unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ + _p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(_p)) + /* Fallback to unoptimized kprobe */ + unoptimize_kprobe(_p, true); arch_arm_kprobe(p); optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ } -static void __kprobes __disarm_kprobe(struct kprobe *p) +/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ +static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt) { - struct kprobe *old_p; + struct kprobe *_p; - unoptimize_kprobe(p); /* Try to unoptimize */ - arch_disarm_kprobe(p); + unoptimize_kprobe(p, false); /* Try to unoptimize */ - /* If another kprobe was blocked, optimize it. */ - old_p = get_optimized_kprobe((unsigned long)p->addr); - if (unlikely(old_p)) - optimize_kprobe(old_p); + if (!kprobe_queued(p)) { + arch_disarm_kprobe(p); + /* If another kprobe was blocked, optimize it. */ + _p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(_p) && reopt) + optimize_kprobe(_p); + } + /* TODO: reoptimize others after unoptimized this probe */ } #else /* !CONFIG_OPTPROBES */ #define optimize_kprobe(p) do {} while (0) -#define unoptimize_kprobe(p) do {} while (0) +#define unoptimize_kprobe(p, f) do {} while (0) #define kill_optimized_kprobe(p) do {} while (0) #define prepare_optimized_kprobe(p) do {} while (0) #define try_to_optimize_kprobe(p) do {} while (0) #define __arm_kprobe(p) arch_arm_kprobe(p) -#define __disarm_kprobe(p) arch_disarm_kprobe(p) +#define __disarm_kprobe(p, o) arch_disarm_kprobe(p) +#define kprobe_disarmed(p) kprobe_disabled(p) +#define wait_for_kprobe_optimizer() do {} while (0) + +/* There should be no unused kprobes can be reused without optimization */ +static void reuse_unused_kprobe(struct kprobe *ap) +{ + printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); + BUG_ON(kprobe_unused(ap)); +} static __kprobes void free_aggr_kprobe(struct kprobe *p) { + arch_remove_kprobe(p); kfree(p); } @@ -732,11 +923,10 @@ static void __kprobes arm_kprobe(struct kprobe *kp) /* Disarm a kprobe with text_mutex */ static void __kprobes disarm_kprobe(struct kprobe *kp) { - get_online_cpus(); /* For avoiding text_mutex deadlock */ + /* Ditto */ mutex_lock(&text_mutex); - __disarm_kprobe(kp); + __disarm_kprobe(kp, true); mutex_unlock(&text_mutex); - put_online_cpus(); } /* @@ -942,7 +1132,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); if (p->break_handler || p->post_handler) - unoptimize_kprobe(ap); /* Fall back to normal kprobe */ + unoptimize_kprobe(ap, true); /* Fall back to normal kprobe */ if (p->break_handler) { if (ap->break_handler) @@ -993,19 +1183,21 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * This is the second or subsequent kprobe at the address - handle * the intricacies */ -static int __kprobes register_aggr_kprobe(struct kprobe *old_p, +static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { int ret = 0; - struct kprobe *ap = old_p; + struct kprobe *ap = orig_p; - if (!kprobe_aggrprobe(old_p)) { - /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ - ap = alloc_aggr_kprobe(old_p); + if (!kprobe_aggrprobe(orig_p)) { + /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ + ap = alloc_aggr_kprobe(orig_p); if (!ap) return -ENOMEM; - init_aggr_kprobe(ap, old_p); - } + init_aggr_kprobe(ap, orig_p); + } else if (kprobe_unused(ap)) + /* This probe is going to die. Rescue it */ + reuse_unused_kprobe(ap); if (kprobe_gone(ap)) { /* @@ -1039,23 +1231,6 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, return add_new_kprobe(ap, p); } -/* Try to disable aggr_kprobe, and return 1 if succeeded.*/ -static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) -{ - struct kprobe *kp; - - list_for_each_entry_rcu(kp, &p->list, list) { - if (!kprobe_disabled(kp)) - /* - * There is an active probe on the list. - * We can't disable aggr_kprobe. - */ - return 0; - } - p->flags |= KPROBE_FLAG_DISABLED; - return 1; -} - static int __kprobes in_kprobes_functions(unsigned long addr) { struct kprobe_blackpoint *kb; @@ -1098,34 +1273,33 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) /* Check passed kprobe is valid and return kprobe in kprobe_table. */ static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) { - struct kprobe *old_p, *list_p; + struct kprobe *ap, *list_p; - old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) + ap = get_kprobe(p->addr); + if (unlikely(!ap)) return NULL; - if (p != old_p) { - list_for_each_entry_rcu(list_p, &old_p->list, list) + if (p != ap) { + list_for_each_entry_rcu(list_p, &ap->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid; return NULL; } valid: - return old_p; + return ap; } /* Return error if the kprobe is being re-registered */ static inline int check_kprobe_rereg(struct kprobe *p) { int ret = 0; - struct kprobe *old_p; mutex_lock(&kprobe_mutex); - old_p = __get_valid_kprobe(p); - if (old_p) + if (__get_valid_kprobe(p)) ret = -EINVAL; mutex_unlock(&kprobe_mutex); + return ret; } @@ -1229,67 +1403,121 @@ fail_with_jump_label: } EXPORT_SYMBOL_GPL(register_kprobe); +/* Check if all probes on the aggrprobe are disabled */ +static int __kprobes aggr_kprobe_disabled(struct kprobe *ap) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &ap->list, list) + if (!kprobe_disabled(kp)) + /* + * There is an active probe on the list. + * We can't disable this ap. + */ + return 0; + + return 1; +} + +/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ +static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) +{ + struct kprobe *orig_p; + + /* Get an original kprobe for return */ + orig_p = __get_valid_kprobe(p); + if (unlikely(orig_p == NULL)) + return NULL; + + if (!kprobe_disabled(p)) { + /* Disable probe if it is a child probe */ + if (p != orig_p) + p->flags |= KPROBE_FLAG_DISABLED; + + /* Try to disarm and disable this/parent probe */ + if (p == orig_p || aggr_kprobe_disabled(orig_p)) { + disarm_kprobe(orig_p); + orig_p->flags |= KPROBE_FLAG_DISABLED; + } + } + + return orig_p; +} + /* * Unregister a kprobe without a scheduler synchronization. */ static int __kprobes __unregister_kprobe_top(struct kprobe *p) { - struct kprobe *old_p, *list_p; + struct kprobe *ap, *list_p; - old_p = __get_valid_kprobe(p); - if (old_p == NULL) + /* Disable kprobe. This will disarm it if needed. */ + ap = __disable_kprobe(p); + if (ap == NULL) return -EINVAL; - if (old_p == p || - (kprobe_aggrprobe(old_p) && - list_is_singular(&old_p->list))) { + if (ap == p) /* - * Only probe on the hash list. Disarm only if kprobes are - * enabled and not gone - otherwise, the breakpoint would - * already have been removed. We save on flushing icache. + * This probe is an independent(and non-optimized) kprobe + * (not an aggrprobe). Remove from the hash list. */ - if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) - disarm_kprobe(old_p); - hlist_del_rcu(&old_p->hlist); - } else { + goto disarmed; + + /* Following process expects this probe is an aggrprobe */ + WARN_ON(!kprobe_aggrprobe(ap)); + + if (list_is_singular(&ap->list) && kprobe_disarmed(ap)) + /* + * !disarmed could be happen if the probe is under delayed + * unoptimizing. + */ + goto disarmed; + else { + /* If disabling probe has special handlers, update aggrprobe */ if (p->break_handler && !kprobe_gone(p)) - old_p->break_handler = NULL; + ap->break_handler = NULL; if (p->post_handler && !kprobe_gone(p)) { - list_for_each_entry_rcu(list_p, &old_p->list, list) { + list_for_each_entry_rcu(list_p, &ap->list, list) { if ((list_p != p) && (list_p->post_handler)) goto noclean; } - old_p->post_handler = NULL; + ap->post_handler = NULL; } noclean: + /* + * Remove from the aggrprobe: this path will do nothing in + * __unregister_kprobe_bottom(). + */ list_del_rcu(&p->list); - if (!kprobe_disabled(old_p)) { - try_to_disable_aggr_kprobe(old_p); - if (!kprobes_all_disarmed) { - if (kprobe_disabled(old_p)) - disarm_kprobe(old_p); - else - /* Try to optimize this probe again */ - optimize_kprobe(old_p); - } - } + if (!kprobe_disabled(ap) && !kprobes_all_disarmed) + /* + * Try to optimize this probe again, because post + * handler may have been changed. + */ + optimize_kprobe(ap); } return 0; + +disarmed: + BUG_ON(!kprobe_disarmed(ap)); + hlist_del_rcu(&ap->hlist); + return 0; } static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) { - struct kprobe *old_p; + struct kprobe *ap; if (list_empty(&p->list)) + /* This is an independent kprobe */ arch_remove_kprobe(p); else if (list_is_singular(&p->list)) { - /* "p" is the last child of an aggr_kprobe */ - old_p = list_entry(p->list.next, struct kprobe, list); + /* This is the last child of an aggrprobe */ + ap = list_entry(p->list.next, struct kprobe, list); list_del(&p->list); - arch_remove_kprobe(old_p); - free_aggr_kprobe(old_p); + free_aggr_kprobe(ap); } + /* Otherwise, do nothing. */ } int __kprobes register_kprobes(struct kprobe **kps, int num) @@ -1607,29 +1835,13 @@ static void __kprobes kill_kprobe(struct kprobe *p) int __kprobes disable_kprobe(struct kprobe *kp) { int ret = 0; - struct kprobe *p; mutex_lock(&kprobe_mutex); - /* Check whether specified probe is valid. */ - p = __get_valid_kprobe(kp); - if (unlikely(p == NULL)) { + /* Disable this kprobe */ + if (__disable_kprobe(kp) == NULL) ret = -EINVAL; - goto out; - } - /* If the probe is already disabled (or gone), just return */ - if (kprobe_disabled(kp)) - goto out; - - kp->flags |= KPROBE_FLAG_DISABLED; - if (p != kp) - /* When kp != p, p is always enabled. */ - try_to_disable_aggr_kprobe(p); - - if (!kprobes_all_disarmed && kprobe_disabled(p)) - disarm_kprobe(p); -out: mutex_unlock(&kprobe_mutex); return ret; } @@ -1927,36 +2139,27 @@ static void __kprobes disarm_all_kprobes(void) mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ - if (kprobes_all_disarmed) - goto already_disabled; + if (kprobes_all_disarmed) { + mutex_unlock(&kprobe_mutex); + return; + } kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); - /* - * Here we call get_online_cpus() for avoiding text_mutex deadlock, - * because disarming may also unoptimize kprobes. - */ - get_online_cpus(); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - __disarm_kprobe(p); + __disarm_kprobe(p, false); } } - mutex_unlock(&text_mutex); - put_online_cpus(); mutex_unlock(&kprobe_mutex); - /* Allow all currently running kprobes to complete */ - synchronize_sched(); - return; -already_disabled: - mutex_unlock(&kprobe_mutex); - return; + /* Wait for disarming all kprobes by optimizer */ + wait_for_kprobe_optimizer(); } /* diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d..5355cfd44a3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), wait_for_completion(&create.done); if (!IS_ERR(create.result)) { - struct sched_param param = { .sched_priority = 0 }; + static struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); @@ -265,6 +265,17 @@ int kthreadd(void *unused) return 0; } +void __init_kthread_worker(struct kthread_worker *worker, + const char *name, + struct lock_class_key *key) +{ + spin_lock_init(&worker->lock); + lockdep_set_class_and_name(&worker->lock, key, name); + INIT_LIST_HEAD(&worker->work_list); + worker->task = NULL; +} +EXPORT_SYMBOL_GPL(__init_kthread_worker); + /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 59b76c8ce9d..1969d2fc4b3 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -494,7 +494,6 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) namelen += 2; for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contention_point[i] == 0) @@ -503,15 +502,13 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contention_point[i]); snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contention_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contention_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contention_point[i], + ip, (void *)class->contention_point[i]); } for (i = 0; i < LOCKSTAT_POINTS; i++) { - char sym[KSYM_SYMBOL_LEN]; char ip[32]; if (class->contending_point[i] == 0) @@ -520,12 +517,11 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (!i) seq_line(m, '-', 40-namelen, namelen); - sprint_symbol(sym, class->contending_point[i]); snprintf(ip, sizeof(ip), "[<%p>]", (void *)class->contending_point[i]); - seq_printf(m, "%40s %14lu %29s %s\n", name, - stats->contending_point[i], - ip, sym); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contending_point[i], + ip, (void *)class->contending_point[i]); } if (i) { seq_puts(m, "\n"); diff --git a/kernel/module.c b/kernel/module.c index d190664f25f..34e00b708fa 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,6 +56,7 @@ #include <linux/percpu.h> #include <linux/kmemleak.h> #include <linux/jump_label.h> +#include <linux/pfn.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -70,6 +71,26 @@ #define ARCH_SHF_SMALL 0 #endif +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_DEBUG_SET_MODULE_RONX=y + */ +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +# define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif + +/* + * Given BASE and SIZE this macro calculates the number of pages the + * memory regions occupies + */ +#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ + (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ + PFN_DOWN((unsigned long)BASE) + 1) \ + : (0UL)) + /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -1542,6 +1563,115 @@ static int __unlink_module(void *_mod) return 0; } +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + */ +void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +{ + unsigned long begin_pfn = PFN_DOWN((unsigned long)start); + unsigned long end_pfn = PFN_DOWN((unsigned long)end); + + if (end_pfn > begin_pfn) + set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +} + +static void set_section_ro_nx(void *base, + unsigned long text_size, + unsigned long ro_size, + unsigned long total_size) +{ + /* begin and end PFNs of the current subsection */ + unsigned long begin_pfn; + unsigned long end_pfn; + + /* + * Set RO for module text and RO-data: + * - Always protect first page. + * - Do not protect last partial page. + */ + if (ro_size > 0) + set_page_attributes(base, base + ro_size, set_memory_ro); + + /* + * Set NX permissions for module data: + * - Do not protect first partial page. + * - Always protect last page. + */ + if (total_size > text_size) { + begin_pfn = PFN_UP((unsigned long)base + text_size); + end_pfn = PFN_UP((unsigned long)base + total_size); + if (end_pfn > begin_pfn) + set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + } +} + +/* Setting memory back to RW+NX before releasing it */ +void unset_section_ro_nx(struct module *mod, void *module_region) +{ + unsigned long total_pages; + + if (mod->module_core == module_region) { + /* Set core as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); + set_memory_nx((unsigned long)mod->module_core, total_pages); + set_memory_rw((unsigned long)mod->module_core, total_pages); + + } else if (mod->module_init == module_region) { + /* Set init as NX+RW */ + total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); + set_memory_nx((unsigned long)mod->module_init, total_pages); + set_memory_rw((unsigned long)mod->module_init, total_pages); + } +} + +/* Iterate through all modules and set each module's text as RW */ +void set_all_modules_text_rw() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_rw); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_rw); + } + } + mutex_unlock(&module_mutex); +} + +/* Iterate through all modules and set each module's text as RO */ +void set_all_modules_text_ro() +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_ro); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_ro); + } + } + mutex_unlock(&module_mutex); +} +#else +static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } +static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } +#endif + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1566,6 +1696,7 @@ static void free_module(struct module *mod) destroy_params(mod->kp, mod->num_kp); /* This may be NULL, but that's OK */ + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1574,6 +1705,7 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ + unset_section_ro_nx(mod, mod->module_core); module_free(mod, mod->module_core); #ifdef CONFIG_MPU @@ -1777,8 +1909,19 @@ static void layout_sections(struct module *mod, struct load_info *info) s->sh_entsize = get_offset(mod, &mod->core_size, s, i); DEBUGP("\t%s\n", name); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->core_size = debug_align(mod->core_size); mod->core_text_size = mod->core_size; + break; + case 1: /* RO: text and ro-data */ + mod->core_size = debug_align(mod->core_size); + mod->core_ro_size = mod->core_size; + break; + case 3: /* whole core */ + mod->core_size = debug_align(mod->core_size); + break; + } } DEBUGP("Init section allocation order:\n"); @@ -1796,8 +1939,19 @@ static void layout_sections(struct module *mod, struct load_info *info) | INIT_OFFSET_MASK); DEBUGP("\t%s\n", sname); } - if (m == 0) + switch (m) { + case 0: /* executable */ + mod->init_size = debug_align(mod->init_size); mod->init_text_size = mod->init_size; + break; + case 1: /* RO: text and ro-data */ + mod->init_size = debug_align(mod->init_size); + mod->init_ro_size = mod->init_size; + break; + case 3: /* whole init */ + mod->init_size = debug_align(mod->init_size); + break; + } } } @@ -2722,6 +2876,18 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -2765,6 +2931,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, mod->symtab = mod->core_symtab; mod->strtab = mod->core_strtab; #endif + unset_section_ro_nx(mod, mod->module_init); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/kernel/mutex.c b/kernel/mutex.c index 200407c1502..a5889fb28ec 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax(); + arch_mutex_cpu_relax(); } #endif spin_lock_mutex(&lock->wait_lock, flags); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2870feee81d..11847bf1e8c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/idr.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> @@ -21,7 +22,9 @@ #include <linux/dcache.h> #include <linux/percpu.h> #include <linux/ptrace.h> +#include <linux/reboot.h> #include <linux/vmstat.h> +#include <linux/device.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/rculist.h> @@ -133,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx) } } +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + /* * If we inherit events we want to return the parent event id * to userspace. @@ -312,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + event->read_size = size; +} + +static void perf_event__header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + perf_event__read_size(event); + + if (sample_type & PERF_SAMPLE_IP) + size += sizeof(data->ip); + + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + +static void perf_event__id_header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu_entry); + + event->id_header_size = size; +} + static void perf_group_attach(struct perf_event *event) { - struct perf_event *group_leader = event->group_leader; + struct perf_event *group_leader = event->group_leader, *pos; /* * We can have double attach due to group movement in perf_event_open. @@ -333,6 +433,11 @@ static void perf_group_attach(struct perf_event *event) list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; + + perf_event__header_size(group_leader); + + list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + perf_event__header_size(pos); } /* @@ -391,7 +496,7 @@ static void perf_group_detach(struct perf_event *event) if (event->group_leader != event) { list_del_init(&event->group_entry); event->group_leader->nr_siblings--; - return; + goto out; } if (!list_empty(&event->group_entry)) @@ -410,6 +515,12 @@ static void perf_group_detach(struct perf_event *event) /* Inherit group flags from the previous leader */ sibling->group_flags = event->group_flags; } + +out: + perf_event__header_size(event->group_leader); + + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + perf_event__header_size(tmp); } static inline int @@ -1073,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh) /* * not supported on inherited events */ - if (event->attr.inherit) + if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); @@ -2289,31 +2400,6 @@ static int perf_release(struct inode *inode, struct file *file) return perf_event_release_kernel(event); } -static int perf_event_read_size(struct perf_event *event) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (event->attr.read_format & PERF_FORMAT_GROUP) { - nr += event->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -2428,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) if (event->state == PERF_EVENT_STATE_ERROR) return 0; - if (count < perf_event_read_size(event)) + if (count < event->read_size) return -ENOSPC; WARN_ON_ONCE(event->ctx->parent_ctx); @@ -2514,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) int ret = 0; u64 value; - if (!event->attr.sample_period) + if (!is_sampling_event(event)) return -EINVAL; if (copy_from_user(&value, arg, sizeof(value))) @@ -3305,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, } while (len); } +static void __perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + header->size += event->id_header_size; + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + } + + if (sample_type & PERF_SAMPLE_TIME) + data->time = perf_clock(); + + if (sample_type & PERF_SAMPLE_ID) + data->id = primary_event_id(event); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + data->stream_id = event->id; + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + } +} + +static void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + if (event->attr.sample_id_all) + __perf_event_header__init_id(header, data, event); +} + +static void __perf_event__output_id_sample(struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + u64 sample_type = data->type; + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); +} + +static void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) +{ + if (event->attr.sample_id_all) + __perf_event__output_id_sample(handle, sample); +} + int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) @@ -3312,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; + struct perf_sample_data sample_data; struct { struct perf_event_header header; u64 id; @@ -3338,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle, goto out; have_lost = local_read(&buffer->lost); - if (have_lost) - size += sizeof(lost_event); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } perf_output_get_handle(handle); @@ -3370,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle, if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); } return 0; @@ -3407,28 +3565,6 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_tgid_nr_ns(p, event->ns); -} - -static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) -{ - /* - * only top level events have the pid namespace they were created in - */ - if (event->parent) - event = event->parent; - - return task_pid_nr_ns(p, event->ns); -} - static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -3603,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header, { u64 sample_type = event->attr.sample_type; - data->type = sample_type; - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header); + header->size = sizeof(*header) + event->header_size; header->misc = 0; header->misc |= perf_misc_flags(regs); - if (sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); - - header->size += sizeof(data->ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_event_pid(event, current); - data->tid_entry.tid = perf_event_tid(event, current); - - header->size += sizeof(data->tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - data->time = perf_clock(); - - header->size += sizeof(data->time); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header->size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - data->id = primary_event_id(event); - - header->size += sizeof(data->id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = event->id; - - header->size += sizeof(data->stream_id); - } - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - - header->size += sizeof(data->cpu_entry); - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header->size += sizeof(data->period); + __perf_event_header__init_id(header, data, event); - if (sample_type & PERF_SAMPLE_READ) - header->size += perf_event_read_size(event); + if (sample_type & PERF_SAMPLE_IP) + data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; @@ -3722,23 +3813,26 @@ perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; + struct perf_sample_data sample; struct perf_read_event read_event = { .header = { .type = PERF_RECORD_READ, .misc = 0, - .size = sizeof(read_event) + perf_event_read_size(event), + .size = sizeof(read_event) + event->read_size, }, .pid = perf_event_pid(event, task), .tid = perf_event_tid(event, task), }; int ret; + perf_event_header__init_id(&read_event.header, &sample, event); ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); if (ret) return; perf_output_put(&handle, read_event); perf_output_read(&handle, event); + perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } @@ -3768,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; + struct perf_sample_data sample; struct task_struct *task = task_event->task; - int size, ret; + int ret, size = task_event->event_id.header.size; - size = task_event->event_id.header.size; - ret = perf_output_begin(&handle, event, size, 0, 0); + perf_event_header__init_id(&task_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + task_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.ppid = perf_event_pid(event, current); @@ -3785,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event, perf_output_put(&handle, task_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + task_event->event_id.header.size = size; } static int perf_event_task_match(struct perf_event *event) @@ -3900,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event, struct perf_comm_event *comm_event) { struct perf_output_handle handle; + struct perf_sample_data sample; int size = comm_event->event_id.header.size; - int ret = perf_output_begin(&handle, event, size, 0, 0); + int ret; + + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + comm_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; comm_event->event_id.pid = perf_event_pid(event, comm_event->task); comm_event->event_id.tid = perf_event_tid(event, comm_event->task); @@ -3912,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event, perf_output_put(&handle, comm_event->event_id); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); + + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + comm_event->event_id.header.size = size; } static int perf_event_comm_match(struct perf_event *event) @@ -3957,7 +4067,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->comm_size = size; comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); @@ -4038,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_mmap_event *mmap_event) { struct perf_output_handle handle; + struct perf_sample_data sample; int size = mmap_event->event_id.header.size; - int ret = perf_output_begin(&handle, event, size, 0, 0); + int ret; + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + mmap_event->event_id.header.size, 0, 0); if (ret) - return; + goto out; mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); @@ -4050,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_put(&handle, mmap_event->event_id); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); + + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +out: + mmap_event->event_id.header.size = size; } static int perf_event_mmap_match(struct perf_event *event, @@ -4205,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma) static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; + struct perf_sample_data sample; int ret; struct { @@ -4226,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable) if (enable) throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + perf_event_header__init_id(&throttle_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + throttle_event.header.size, 1, 0); if (ret) return; perf_output_put(&handle, throttle_event); + perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); } @@ -4246,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, struct hw_perf_event *hwc = &event->hw; int ret = 0; + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. + */ + if (unlikely(!is_sampling_event(event))) + return 0; + if (!throttle) { hwc->interrupts++; } else { @@ -4391,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!regs) return; - if (!hwc->sample_period) + if (!is_sampling_event(event)) return; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) @@ -4554,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct hlist_head *head; - if (hwc->sample_period) { + if (is_sampling_event(event)) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } @@ -4811,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((event->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - err = perf_trace_init(event); if (err) return err; @@ -4842,7 +4963,7 @@ static struct pmu perf_tracepoint = { static inline void perf_tp_register(void) { - perf_pmu_register(&perf_tracepoint); + perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT); } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4932,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) static void perf_swevent_start_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + s64 period; + + if (!is_sampling_event(event)) + return; hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - s64 period = local64_read(&hwc->period_left); - if (period) { - if (period < 0) - period = 10000; + period = local64_read(&hwc->period_left); + if (period) { + if (period < 0) + period = 10000; - local64_set(&hwc->period_left, 0); - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL_PINNED, 0); - } } static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (hwc->sample_period) { + if (is_sampling_event(event)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); @@ -5184,8 +5307,61 @@ static void free_pmu_context(struct pmu *pmu) out: mutex_unlock(&pmus_lock); } +static struct idr pmu_idr; + +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); +} + +static struct device_attribute pmu_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_NULL, +}; + +static int pmu_bus_running; +static struct bus_type pmu_bus = { + .name = "event_source", + .dev_attrs = pmu_dev_attrs, +}; + +static void pmu_dev_release(struct device *dev) +{ + kfree(dev); +} + +static int pmu_dev_alloc(struct pmu *pmu) +{ + int ret = -ENOMEM; + + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!pmu->dev) + goto out; + + device_initialize(pmu->dev); + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + + dev_set_drvdata(pmu->dev, pmu); + pmu->dev->bus = &pmu_bus; + pmu->dev->release = pmu_dev_release; + ret = device_add(pmu->dev); + if (ret) + goto free_dev; + +out: + return ret; + +free_dev: + put_device(pmu->dev); + goto out; +} -int perf_pmu_register(struct pmu *pmu) +int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5195,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu) if (!pmu->pmu_disable_count) goto unlock; + pmu->type = -1; + if (!name) + goto skip_type; + pmu->name = name; + + if (type < 0) { + int err = idr_pre_get(&pmu_idr, GFP_KERNEL); + if (!err) + goto free_pdc; + + err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); + if (err) { + ret = err; + goto free_pdc; + } + } + pmu->type = type; + + if (pmu_bus_running) { + ret = pmu_dev_alloc(pmu); + if (ret) + goto free_idr; + } + +skip_type: pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); if (pmu->pmu_cpu_context) goto got_cpu_context; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) - goto free_pdc; + goto free_dev; for_each_possible_cpu(cpu) { struct perf_cpu_context *cpuctx; @@ -5245,6 +5446,14 @@ unlock: return ret; +free_dev: + device_del(pmu->dev); + put_device(pmu->dev); + +free_idr: + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; @@ -5264,6 +5473,10 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + device_del(pmu->dev); + put_device(pmu->dev); free_pmu_context(pmu); } @@ -5273,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event) int idx; idx = srcu_read_lock(&pmus_srcu); + + rcu_read_lock(); + pmu = idr_find(&pmu_idr, event->attr.type); + rcu_read_unlock(); + if (pmu) + goto unlock; + list_for_each_entry_rcu(pmu, &pmus, entry) { int ret = pmu->event_init(event); if (!ret) @@ -5738,6 +5958,12 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(¤t->perf_event_mutex); /* + * Precalculate sample_data sizes + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + + /* * Drop the reference on the group_event after placing the * new event on the sibling_list. This ensures destruction * of the group leader will find the pointer to itself in @@ -6090,6 +6316,12 @@ inherit_event(struct perf_event *parent_event, child_event->overflow_handler = parent_event->overflow_handler; /* + * Precalculate sample_data sizes + */ + perf_event__header_size(child_event); + perf_event__id_header_size(child_event); + + /* * Link it up in the child's context: */ raw_spin_lock_irqsave(&child_ctx->lock, flags); @@ -6320,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) mutex_unlock(&swhash->hlist_mutex); } -#ifdef CONFIG_HOTPLUG_CPU +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC static void perf_pmu_rotate_stop(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -6374,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu) static inline void perf_event_exit_cpu(int cpu) { } #endif +static int +perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) +{ + int cpu; + + for_each_online_cpu(cpu) + perf_event_exit_cpu(cpu); + + return NOTIFY_OK; +} + +/* + * Run the perf reboot notifier at the very last possible moment so that + * the generic watchdog code runs as long as possible. + */ +static struct notifier_block perf_reboot_notifier = { + .notifier_call = perf_reboot, + .priority = INT_MIN, +}; + static int __cpuinit perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -6402,14 +6654,45 @@ void __init perf_event_init(void) { int ret; + idr_init(&pmu_idr); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); - perf_pmu_register(&perf_swevent); - perf_pmu_register(&perf_cpu_clock); - perf_pmu_register(&perf_task_clock); + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); + perf_pmu_register(&perf_cpu_clock, NULL, -1); + perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } + +static int __init perf_event_sysfs_init(void) +{ + struct pmu *pmu; + int ret; + + mutex_lock(&pmus_lock); + + ret = bus_register(&pmu_bus); + if (ret) + goto unlock; + + list_for_each_entry(pmu, &pmus, entry) { + if (!pmu->name || pmu->type < 0) + continue; + + ret = pmu_dev_alloc(pmu); + WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); + } + pmu_bus_running = 1; + ret = 0; + +unlock: + mutex_unlock(&pmus_lock); + + return ret; +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9ca4973f736..93bd2eb2bc5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -145,7 +145,13 @@ static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { @@ -619,7 +625,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ecf770509d0..031d5e3a619 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -22,6 +22,7 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/suspend.h> +#include <trace/events/power.h> #include "power.h" @@ -201,6 +202,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (!suspend_ops) return -ENOSYS; + trace_machine_suspend(state); if (suspend_ops->begin) { error = suspend_ops->begin(state); if (error) @@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state) Close: if (suspend_ops->end) suspend_ops->end(); + trace_machine_suspend(PWR_EVENT_EXIT); return error; Recover_platform: diff --git a/kernel/printk.c b/kernel/printk.c index a23315dc449..ab3ffc5b3b6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending); void printk_tick(void) { - if (__get_cpu_var(printk_pending)) { - __get_cpu_var(printk_pending) = 0; + if (__this_cpu_read(printk_pending)) { + __this_cpu_write(printk_pending, 0); wake_up_interruptible(&log_wait); } } int printk_needs_cpu(int cpu) { - if (unlikely(cpu_is_offline(cpu))) + if (cpu_is_offline(cpu)) printk_tick(); - return per_cpu(printk_pending, cpu); + return __this_cpu_read(printk_pending); } void wake_up_klogd(void) diff --git a/kernel/sched.c b/kernel/sched.c index e6f8f125431..04949089e76 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -75,9 +75,11 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #include "sched_cpupri.h" #include "workqueue_sched.h" +#include "sched_autogroup.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -253,6 +255,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + + atomic_t load_weight; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -268,24 +272,19 @@ struct task_group { struct task_group *parent; struct list_head siblings; struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif }; #define root_task_group init_task_group -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. - */ +/* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return list_empty(&root_task_group.children); -} -#endif - # define INIT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -342,6 +341,7 @@ struct cfs_rq { * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This * list is used during load balance. */ + int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ @@ -360,14 +360,17 @@ struct cfs_rq { unsigned long h_load; /* - * this cpu's part of tg->shares + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time */ - unsigned long shares; + u64 load_avg; + u64 load_period; + u64 load_stamp, load_last, load_unacc_exec_time; - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; + unsigned long load_contribution; #endif #endif }; @@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq) */ static inline struct task_group *task_group(struct task_struct *p) { + struct task_group *tg; struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); - return container_of(css, struct task_group, css); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -793,20 +799,6 @@ late_initcall(sched_init_debug); const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - -/* * period over which we average the RT time consumption, measured * in ms. * @@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long __percpu *update_shares_data; - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, - unsigned long sd_rq_weight, - unsigned long *usd_rq_weight) -{ - unsigned long shares, rq_weight; - int boost = 0; - - rq_weight = usd_rq_weight[cpu]; - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum_j shares_j * rq_weight_i - * shares_i = ----------------------------- - * \Sum_j rq_weight_j - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - __set_se_shares(tg->se[cpu], shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} - -/* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. - */ -static int tg_shares_up(struct task_group *tg, void *data) -{ - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; - unsigned long *usd_rq_weight; - struct sched_domain *sd = data; - unsigned long flags; - int i; - - if (!tg->se[0]) - return 0; - - local_irq_save(flags); - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); - - for_each_cpu(i, sched_domain_span(sd)) { - weight = tg->cfs_rq[i]->load.weight; - usd_rq_weight[i] = weight; - - rq_weight += weight; - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - if (!weight) - weight = NICE_0_LOAD; - - sum_weight += weight; - shares += tg->cfs_rq[i]->shares; - } - - if (!rq_weight) - rq_weight = sum_weight; - - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; - - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; - - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); - - local_irq_restore(flags); - - return 0; -} - /* * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child @@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data) load = cpu_rq(cpu)->load.weight; } else { load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; + load *= tg->se[cpu]->load.weight; load /= tg->parent->cfs_rq[cpu]->load.weight + 1; } @@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(struct sched_domain *sd) -{ - s64 elapsed; - u64 now; - - if (root_task_group_empty()) - return; - - now = local_clock(); - elapsed = now - sd->last_update; - - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } -} - static void update_h_load(long cpu) { walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -#else - -static inline void update_shares(struct sched_domain *sd) -{ -} - #endif #ifdef CONFIG_PREEMPT @@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_autogroup.c" #include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" @@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, int dest_cpu) +static bool migrate_task(struct task_struct *p, struct rq *rq) { - struct rq *rq = task_rq(p); - /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. @@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (unlikely(dest_cpu >= nr_cpu_ids)) { - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags) /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif +#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif put_cpu(); } @@ -3549,7 +3418,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) if (task_thread_info(rq->curr) != owner || need_resched()) return 0; - cpu_relax(); + arch_mutex_cpu_relax(); } return 1; @@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); * This waits for either a completion of a specific task to be signaled or for a * specified timeout to expire. It is interruptible. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) { @@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); * signaled or for a specified timeout to expire. It can be * interrupted by a kill signal. The timeout is in jiffies. */ -unsigned long __sched +long __sched wait_for_completion_killable_timeout(struct completion *x, unsigned long timeout) { @@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p) } static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) + const struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -5056,7 +4925,7 @@ recheck: * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, true); } @@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * but our caller might not have that capability. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, false); } @@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + printk(KERN_INFO "%-15.15s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) @@ -5754,7 +5623,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_min_granularity); SET_SYSCTL(sched_latency); SET_SYSCTL(sched_wakeup_granularity); - SET_SYSCTL(sched_shares_ratelimit); #undef SET_SYSCTL } @@ -5830,7 +5698,7 @@ again: goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, dest_cpu)) { + if (migrate_task(p, rq)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); @@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data) } #ifdef CONFIG_HOTPLUG_CPU + /* - * Figure out where task on dead CPU should go, use force if necessary. + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. */ -void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void idle_task_exit(void) { - struct rq *rq = cpu_rq(dead_cpu); - int needs_cpu, uninitialized_var(dest_cpu); - unsigned long flags; + struct mm_struct *mm = current->active_mm; - local_irq_save(flags); + BUG_ON(cpu_online(smp_processor_id())); - raw_spin_lock(&rq->lock); - needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); - if (needs_cpu) - dest_cpu = select_fallback_rq(dead_cpu, p); - raw_spin_unlock(&rq->lock); - /* - * It can only fail if we race with set_cpus_allowed(), - * in the racer should migrate the task anyway. - */ - if (needs_cpu) - __migrate_task(p, dead_cpu, dest_cpu); - local_irq_restore(flags); + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); } /* @@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) static void migrate_nr_uninterruptible(struct rq *rq_src) { struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - unsigned long flags; - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); } /* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. + * remove the tasks which were accounted by rq from calc_load_tasks. */ -void sched_idle_next(void) +static void calc_global_load_remove(struct rq *rq) { - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - raw_spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - activate_task(rq, p, 0); - - raw_spin_unlock_irqrestore(&rq->lock, flags); + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } /* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +static void migrate_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. */ - raw_spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - raw_spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + rq->stop = NULL; for ( ; ; ) { - if (!rq->nr_running) + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) break; + next = pick_next_task(rq); - if (!next) - break; + BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); } -} -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) -{ - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + rq->stop = stop; } + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: - case CPU_ONLINE_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_live_tasks(cpu); - /* Idle task back to normal (off runqueue, low prio) */ - raw_spin_lock_irq(&rq->lock); - deactivate_task(rq, rq->idle, 0); - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - raw_spin_unlock_irq(&rq->lock); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - calc_global_load_remove(rq); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + + migrate_nr_uninterruptible(rq); + calc_global_load_remove(rq); break; #endif } @@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, + struct sched_entity *se, int cpu, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - if (add) - list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; /* se could be NULL for init_task_group */ @@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; + update_load_set(&se->load, 0); se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent) { struct rq *rq = cpu_rq(cpu); @@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; if (!rt_se) @@ -8164,13 +7946,9 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); - + autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), - __alignof__(unsigned long)); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8184,7 +7962,6 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); -#ifdef CONFIG_CGROUP_SCHED /* * How much cpu bandwidth does init_task_group get? * @@ -8204,16 +7981,13 @@ void __init sched_init(void) * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); -#endif + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); -#ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); -#endif + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -8293,8 +8067,6 @@ void __init sched_init(void) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_event_init(); - scheduler_running = 1; } @@ -8488,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } return 1; @@ -8499,15 +8271,21 @@ err: return 0; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #else /* !CONFG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) @@ -8520,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } @@ -8578,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } return 1; @@ -8588,17 +8362,6 @@ err_free_rq: err: return 0; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} #else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { @@ -8609,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED @@ -8632,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; - int i; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -8645,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) goto err; spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } list_add_rcu(&tg->list, &task_groups); WARN_ON(!parent); /* root should already exist */ @@ -8678,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) unsigned long flags; int i; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { + /* end participation in shares distribution */ + for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } + + spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); @@ -8729,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void __set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - int on_rq; - - on_rq = se->on_rq; - if (on_rq) - dequeue_entity(cfs_rq, se, 0); - - se->load.weight = shares; - se->load.inv_weight = 0; - - if (on_rq) - enqueue_entity(cfs_rq, se, 0); -} - -static void set_se_shares(struct sched_entity *se, unsigned long shares) -{ - struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - __set_se_shares(se, shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) @@ -8778,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ tg->shares = shares; for_each_possible_cpu(i) { - /* - * force a rebalance - */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); - set_se_shares(tg->se[i], shares); + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); } - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); return 0; diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 00000000000..c80fedcd476 --- /dev/null +++ b/kernel/sched_autogroup.c @@ -0,0 +1,238 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/kallsyms.h> +#include <linux/utsname.h> + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +static void autogroup_init(struct task_struct *init_task) +{ + autogroup_default.tg = &init_task_group; + init_task_group.autogroup = &autogroup_default; + kref_init(&autogroup_default.kref); + init_rwsem(&autogroup_default.lock); + init_task->signal->autogroup = &autogroup_default; +} + +static inline void autogroup_free(struct task_group *tg) +{ + kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ + struct autogroup *ag = container_of(kref, struct autogroup, kref); + + sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ + kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ + kref_get(&ag->kref); + return ag; +} + +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ + struct autogroup *ag; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return autogroup_kref_get(&autogroup_default); + + ag = autogroup_kref_get(p->signal->autogroup); + unlock_task_sighand(p, &flags); + + return ag; +} + +static inline struct autogroup *autogroup_create(void) +{ + struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); + struct task_group *tg; + + if (!ag) + goto out_fail; + + tg = sched_create_group(&init_task_group); + + if (IS_ERR(tg)) + goto out_free; + + kref_init(&ag->kref); + init_rwsem(&ag->lock); + ag->id = atomic_inc_return(&autogroup_seq_nr); + ag->tg = tg; + tg->autogroup = ag; + + return ag; + +out_free: + kfree(ag); +out_fail: + if (printk_ratelimit()) { + printk(KERN_WARNING "autogroup_create: %s failure.\n", + ag ? "sched_create_group()" : "kmalloc()"); + } + + return autogroup_kref_get(&autogroup_default); +} + +static inline bool +task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ + if (tg != &root_task_group) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + /* + * We can only assume the task group can't go away on us if + * autogroup_move_group() can see us on ->thread_group list. + */ + if (p->flags & PF_EXITING) + return false; + + return true; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ + struct autogroup *prev; + struct task_struct *t; + unsigned long flags; + + BUG_ON(!lock_task_sighand(p, &flags)); + + prev = p->signal->autogroup; + if (prev == ag) { + unlock_task_sighand(p, &flags); + return; + } + + p->signal->autogroup = autogroup_kref_get(ag); + + t = p; + do { + sched_move_task(t); + } while_each_thread(p, t); + + unlock_task_sighand(p, &flags); + autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ + struct autogroup *ag = autogroup_create(); + + autogroup_move_group(p, ag); + /* drop extra refrence added by autogroup_create() */ + autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock. Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ + autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ + sig->autogroup = autogroup_task_get(current); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ + autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ + sysctl_sched_autogroup_enabled = 0; + + return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (*nice < -20 || *nice > 19) + return -EINVAL; + + err = security_task_setnice(current, *nice); + if (err) + return err; + + if (*nice < 0 && !can_nice(current, *nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. */ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + if (!err) + ag->nice = *nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 00000000000..5358e241cb2 --- /dev/null +++ b/kernel/sched_autogroup.h @@ -0,0 +1,32 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 52f1a149bfb..9d8af0b3fb6 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) } EXPORT_SYMBOL_GPL(sched_clock); -static __read_mostly int sched_clock_running; +__read_mostly int sched_clock_running; #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK __read_mostly int sched_clock_stable; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9..1dfae3d014b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) #ifdef CONFIG_FAIR_GROUP_SCHED -static void print_cfs_group_stats(struct seq_file *m, int cpu, - struct task_group *tg) +static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { struct sched_entity *se = tg->se[cpu]; if (!se) @@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif -#ifdef CONFIG_CGROUP_SCHED - { - char path[64]; - - rcu_read_lock(); - cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); - rcu_read_unlock(); - SEQ_printf(m, " %s", path); - } -#endif SEQ_printf(m, "\n"); } @@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_unlock_irqrestore(&tasklist_lock, flags); } -#if defined(CONFIG_CGROUP_SCHED) && \ - (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) -static void task_group_path(struct task_group *tg, char *buf, int buflen) -{ - /* may be NULL if the underlying cgroup isn't fully-created yet */ - if (!tg->css.cgroup) { - buf[0] = '\0'; - return; - } - cgroup_path(tg->css.cgroup, buf, buflen); -} -#endif - void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, @@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) - char path[128]; - struct task_group *tg = cfs_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); -#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", + SPLIT_NS(cfs_rq->load_avg)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", + SPLIT_NS(cfs_rq->load_period)); + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", + cfs_rq->load_contribution); + SEQ_printf(m, " .%-30s: %d\n", "load_tg", + atomic_read(&cfs_rq->tg->load_weight)); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { -#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) - char path[128]; - struct task_group *tg = rt_rq->tg; - - task_group_path(tg, path, sizeof(path)); - - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); -#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); -#endif - #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) #undef P } +extern __read_mostly int sched_clock_running; + static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); @@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = { static int sched_debug_show(struct seq_file *m, void *v) { - u64 now = ktime_to_ns(ktime_get()); + u64 ktime, sched_clk, cpu_clk; + unsigned long flags; int cpu; - SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + local_irq_save(flags); + ktime = ktime_to_ns(ktime_get()); + sched_clk = sched_clock(); + cpu_clk = local_clock(); + local_irq_restore(flags); + + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); +#define P(x) \ + SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(ktime); + PN(sched_clk); + PN(cpu_clk); + P(jiffies); +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + P(sched_clock_stable); +#endif +#undef PN +#undef P + + SEQ_printf(m, "\n"); + SEQ_printf(m, "sysctl_sched\n"); #define P(x) \ SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - P(jiffies); PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 00ebd768667..c62ebae65cf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +/* + * The exponential sliding window over which load is averaged for shares + * distribution. + * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + static const struct sched_class fair_sched_class; /************************************************************** @@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return cfs_rq->tg->cfs_rq[this_cpu]; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases. + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } else { + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } + + cfs_rq->on_list = 1; + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) @@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return &cpu_rq(this_cpu)->cfs; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) @@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); + +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED + cfs_rq->load_unacc_exec_time += delta_exec; +#endif } static void update_curr(struct cfs_rq *cfs_rq) @@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, + int global_update) +{ + struct task_group *tg = cfs_rq->tg; + long load_avg; + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + + if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + } +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ + u64 period = sysctl_sched_shares_window; + u64 now, delta; + unsigned long load = cfs_rq->load.weight; + + if (!cfs_rq) + return; + + now = rq_of(cfs_rq)->clock; + delta = now - cfs_rq->load_stamp; + + /* truncate load history at 4 idle periods */ + if (cfs_rq->load_stamp > cfs_rq->load_last && + now - cfs_rq->load_last > 4 * period) { + cfs_rq->load_period = 0; + cfs_rq->load_avg = 0; + } + + cfs_rq->load_stamp = now; + cfs_rq->load_unacc_exec_time = 0; + cfs_rq->load_period += delta; + if (load) { + cfs_rq->load_last = now; + cfs_rq->load_avg += delta * load; + } + + /* consider updating load contribution on each fold or truncate */ + if (global_update || cfs_rq->load_period > period + || !cfs_rq->load_period) + update_cfs_rq_load_contribution(cfs_rq, global_update); + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } + + if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) + list_del_leaf_cfs_rq(cfs_rq); +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + } + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +{ + struct task_group *tg; + struct sched_entity *se; + long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +{ +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) { @@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; + + if (cfs_rq->nr_running == 1) + list_add_leaf_cfs_rq(cfs_rq); } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ update_curr(cfs_rq); + /* + * Update share accounting for long-running entities. + */ + update_entity_shares_tick(cfs_rq); + #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother @@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } + hrtick_update(rq); } @@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } + hrtick_update(rq); } @@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -1909,6 +2071,48 @@ out: } #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int update_shares_cpu(struct task_group *tg, int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned long flags; + struct rq *rq; + + if (!tg->se[cpu]) + return 0; + + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + update_cfs_load(cfs_rq, 1); + + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. + */ + update_cfs_shares(cfs_rq, 0); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return 0; +} + +static void update_shares(int cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(cpu); + + rcu_read_lock(); + for_each_leaf_cfs_rq(rq, cfs_rq) + update_shares_cpu(cfs_rq->tg, cpu); + rcu_read_unlock(); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return max_load_move - rem_load_move; } #else +static inline void update_shares(int cpu) +{ +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3174,8 +3381,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) */ raw_spin_unlock(&this_rq->lock); + update_shares(this_cpu); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a..68e69acc29b 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_SHARES_UPDATE, 1) -SCHED_FEAT(ASYM_EFF_LOAD, 1) /* * Spin-wait on mutex acquisition when the mutex owner is running on diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9..c914ec747ca 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_add_rcu(&rt_rq->leaf_rt_rq_list, + &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) @@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(def_rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) @@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; + if (!rt_rq->rt_nr_running) + list_add_leaf_rt_rq(rt_rq); + if (head) list_add(&rt_se->run_list, queue); else @@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); + if (!rt_rq->rt_nr_running) + list_del_leaf_rt_rq(rt_rq); } /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 18f4be0d5fe..d4d918a9188 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { + .sched_priority = MAX_RT_PRIO-1 + }; p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; diff --git a/kernel/sys.c b/kernel/sys.c index 7f5a0cd296a..2745dcdb6c6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); - if (err > 0) + if (err > 0) { proc_sid_connector(group_leader); + sched_autogroup_create_attach(group_leader); + } return err; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5abfa151855..ae5cbb1e3ce 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -static int min_sched_shares_ratelimit = 100000; /* 100 usec */ -static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif #ifdef CONFIG_COMPACTION @@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_wakeup_granularity_ns, }, { - .procname = "sched_shares_ratelimit", - .data = &sysctl_sched_shares_ratelimit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_shares_ratelimit, - .extra2 = &max_sched_shares_ratelimit, - }, - { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), @@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_tunable_scaling, }, { - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, - { .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, .maxlen = sizeof(unsigned int), @@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "sched_shares_window", + .data = &sysctl_sched_shares_window, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "timer_migration", .data = &sysctl_timer_migration, .maxlen = sizeof(unsigned int), @@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -745,21 +744,21 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, + .procname = "nmi_watchdog", + .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dowatchdog_enabled, }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_nmi_enabled, + .proc_handler = proc_dointvec, }, #endif #if defined(CONFIG_X86) diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c578606..4b2545a136f 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -136,7 +136,6 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { CTL_INT, KERN_NMI_WATCHDOG, "nmi_watchdog" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, {} }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index c8231fb1570..3308fd7f1b5 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -349,25 +349,47 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } +#ifdef CONFIG_IA64 +#define TASKSTATS_NEEDS_PADDING 1 +#endif + static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { struct nlattr *na, *ret; int aggr; - /* If we don't pad, we end up with alignment on a 4 byte boundary. - * This causes lots of runtime warnings on systems requiring 8 byte - * alignment */ - u32 pids[2] = { pid, 0 }; - int pid_size = ALIGN(sizeof(pid), sizeof(long)); - aggr = (type == TASKSTATS_TYPE_PID) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; + /* + * The taskstats structure is internally aligned on 8 byte + * boundaries but the layout of the aggregrate reply, with + * two NLA headers and the pid (each 4 bytes), actually + * force the entire structure to be unaligned. This causes + * the kernel to issue unaligned access warnings on some + * architectures like ia64. Unfortunately, some software out there + * doesn't properly unroll the NLA packet and assumes that the start + * of the taskstats structure will always be 20 bytes from the start + * of the netlink payload. Aligning the start of the taskstats + * structure breaks this software, which we don't want. So, for now + * the alignment only happens on architectures that require it + * and those users will have to update to fixed versions of those + * packages. Space is reserved in the packet only when needed. + * This ifdef should be removed in several years e.g. 2012 once + * we can be confident that fixed versions are installed on most + * systems. We add the padding before the aggregate since the + * aggregate is already a defined type. + */ +#ifdef TASKSTATS_NEEDS_PADDING + if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) + goto err; +#endif na = nla_nest_start(skb, aggr); if (!na) goto err; - if (nla_put(skb, type, pid_size, pids) < 0) + + if (nla_put(skb, type, sizeof(pid), &pid) < 0) goto err; ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); if (!ret) @@ -456,6 +478,18 @@ out: return rc; } +static size_t taskstats_packet_size(void) +{ + size_t size; + + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); +#ifdef TASKSTATS_NEEDS_PADDING + size += nla_total_size(0); /* Padding for alignment */ +#endif + return size; +} + static int cmd_attr_pid(struct genl_info *info) { struct taskstats *stats; @@ -464,8 +498,7 @@ static int cmd_attr_pid(struct genl_info *info) u32 pid; int rc; - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) @@ -494,8 +527,7 @@ static int cmd_attr_tgid(struct genl_info *info) u32 tgid; int rc; - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) @@ -570,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) /* * Size includes space for nested attributes */ - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); is_thread_group = !!taskstats_tgid_alloc(tsk); if (is_thread_group) { diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c index ac38fbb176c..a9ae369925c 100644 --- a/kernel/time/timecompare.c +++ b/kernel/time/timecompare.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/math64.h> +#include <linux/kernel.h> /* * fixed point arithmetic scale factor for skew @@ -57,11 +58,11 @@ int timecompare_offset(struct timecompare *sync, int index; int num_samples = sync->num_samples; - if (num_samples > sizeof(buffer)/sizeof(buffer[0])) { + if (num_samples > ARRAY_SIZE(buffer)) { samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); if (!samples) { samples = buffer; - num_samples = sizeof(buffer)/sizeof(buffer[0]); + num_samples = ARRAY_SIZE(buffer); } } else { samples = buffer; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 49010d822f7..5bb86da8200 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,6 +32,8 @@ struct timekeeper { cycle_t cycle_interval; /* Number of clock shifted nano seconds in one NTP interval. */ u64 xtime_interval; + /* shifted nano seconds left over when rounding cycle_interval */ + s64 xtime_remainder; /* Raw nano seconds accumulated per NTP interval. */ u32 raw_interval; @@ -62,7 +64,7 @@ struct timekeeper timekeeper; static void timekeeper_setup_internals(struct clocksource *clock) { cycle_t interval; - u64 tmp; + u64 tmp, ntpinterval; timekeeper.clock = clock; clock->cycle_last = clock->read(clock); @@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; + ntpinterval = tmp; tmp += clock->mult/2; do_div(tmp, clock->mult); if (tmp == 0) @@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock) /* Go back from cycles -> shifted ns */ timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; timekeeper.raw_interval = ((u64) interval * clock->mult) >> clock->shift; @@ -719,7 +723,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; - timekeeper.ntp_error -= timekeeper.xtime_interval << + timekeeper.ntp_error -= + (timekeeper.xtime_interval + timekeeper.xtime_remainder) << (timekeeper.ntp_error_shift + shift); return offset; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ab8f5e33fa9..32a19f9397f 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -79,26 +79,26 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, { struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct rb_node *curr; + struct timerqueue_node *curr; unsigned long flags; next_one: i = 0; raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = base->first; + curr = timerqueue_getnext(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = rb_next(curr); + curr = timerqueue_iterate_next(curr); i++; } if (curr) { - timer = rb_entry(curr, struct hrtimer, node); + timer = container_of(curr, struct hrtimer, node); tmp = *timer; raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); diff --git a/kernel/timer.c b/kernel/timer.c index 353b9227c2e..43ca9936f2d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -88,18 +88,6 @@ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; -/* - * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB to - * indicate whether the timer is deferrable. - * - * A deferrable timer will work normally when the system is busy, but - * will not cause a CPU to come out of idle just to service it; instead, - * the timer will be serviced when the CPU eventually wakes up with a - * subsequent non-deferrable timer. - */ -#define TBASE_DEFERRABLE_FLAG (0x1) - /* Functions below help us manage 'deferrable' flag */ static inline unsigned int tbase_get_deferrable(struct tvec_base *base) { @@ -113,8 +101,7 @@ static inline struct tvec_base *tbase_get_base(struct tvec_base *base) static inline void timer_set_deferrable(struct timer_list *timer) { - timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + timer->base = TBASE_MAKE_DEFERRED(timer->base); } static inline void @@ -343,15 +330,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); - -static inline void set_running_timer(struct tvec_base *base, - struct timer_list *timer) -{ -#ifdef CONFIG_SMP - base->running_timer = timer; -#endif -} - static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; @@ -936,15 +914,12 @@ int del_timer(struct timer_list *timer) } EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. - * - * It must not be called from interrupt contexts. */ int try_to_del_timer_sync(struct timer_list *timer) { @@ -973,6 +948,7 @@ out: } EXPORT_SYMBOL(try_to_del_timer_sync); +#ifdef CONFIG_SMP /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -983,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * interrupt contexts. The caller must not hold locks which would prevent + * hardirq contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. @@ -993,14 +969,16 @@ EXPORT_SYMBOL(try_to_del_timer_sync); int del_timer_sync(struct timer_list *timer) { #ifdef CONFIG_LOCKDEP - unsigned long flags; - - local_irq_save(flags); + local_bh_disable(); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - local_irq_restore(flags); + local_bh_enable(); #endif - + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_irq()); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1111,7 +1089,7 @@ static inline void __run_timers(struct tvec_base *base) timer_stats_account_timer(timer); - set_running_timer(base, timer); + base->running_timer = timer; detach_timer(timer, 1); spin_unlock_irq(&base->lock); @@ -1119,7 +1097,7 @@ static inline void __run_timers(struct tvec_base *base) spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + base->running_timer = NULL; spin_unlock_irq(&base->lock); } @@ -1249,7 +1227,7 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, */ unsigned long get_next_timer_interrupt(unsigned long now) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); unsigned long expires; /* @@ -1298,7 +1276,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __get_cpu_var(tvec_bases); + struct tvec_base *base = __this_cpu_read(tvec_bases); hrtimer_run_pending(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ea37e2ff416..14674dce77a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -69,6 +69,21 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool +config EVENT_POWER_TRACING_DEPRECATED + depends on EVENT_TRACING + bool "Deprecated power event trace API, to be removed" + default y + help + Provides old power event types: + C-state/idle accounting events: + power:power_start + power:power_end + and old cpufreq accounting event: + power:power_frequency + This is for userspace compatibility + and will vanish after 5 kernel iterations, + namely 2.6.41. + config CONTEXT_SWITCH_TRACER bool diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index a22582a0616..f55fcf61b22 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -13,5 +13,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/power.h> -EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); +#ifdef EVENT_POWER_TRACING_DEPRECATED +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +#endif +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9ed509a015d..bd1c35a4fbc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* Need to copy one event at a time */ do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; @@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, event = rb_reader_event(cpu_buffer); /* Always keep the time extend and data together */ size = rb_event_ts_length(event); - } while (len > size); + } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 39c059ca670..19a359d5e6d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -21,17 +21,46 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; +static int perf_trace_event_perm(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + /* No tracing, just counting, so no obvious leak */ + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + return 0; + + /* Some events are ok to be traced by non-root users... */ + if (p_event->attach_state == PERF_ATTACH_TASK) { + if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) + return 0; + } + + /* + * ...otherwise raw tracepoint data can be a severe data leak, + * only allow root to have these. + */ + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret = -ENOMEM; + int ret; int cpu; + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; + ret = -ENOMEM; + list = alloc_percpu(struct hlist_head); if (!list) goto fail; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0725eeab193..35fde09b81d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,6 +27,12 @@ DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_storage_mutex); +EXPORT_SYMBOL_GPL(event_storage_mutex); + +char event_storage[EVENT_STORAGE_SIZE]; +EXPORT_SYMBOL_GPL(event_storage); + LIST_HEAD(ftrace_events); LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4ba44deaac2..4b74d71705c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -83,13 +83,19 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + do { \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + mutex_lock(&event_storage_mutex); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), FILTER_OTHER); \ - if (ret) \ - return ret; + mutex_unlock(&event_storage_mutex); \ + if (ret) \ + return ret; \ + } while (0); #undef __array_desc #define __array_desc(type, container, item, len) \ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b320..562c56e048f 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; + static struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/user.c b/kernel/user.c index 2c7d8d5914b..5c598ca781d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + put_user_ns(ns); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6e3c41a4024..6e7b575ac33 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -57,6 +57,8 @@ static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "0", 1)) + no_watchdog = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -307,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -364,7 +366,8 @@ static int watchdog_nmi_enable(int cpu) goto out_save; } - printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ @@ -547,13 +550,13 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -static int __init spawn_watchdog_task(void) +void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; if (no_watchdog) - return 0; + return; err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); WARN_ON(notifier_to_errno(err)); @@ -561,6 +564,5 @@ static int __init spawn_watchdog_task(void) cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); - return 0; + return; } -early_initcall(spawn_watchdog_task); |