Diffstat (limited to 'kernel')
37 files changed, 575 insertions, 252 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index eb0f9165b40..2924251a654 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb) } /* Receive messages from netlink socket. */ -static void audit_receive(struct sock *sk, int length) +static void audit_receive(struct sk_buff *skb) { - struct sk_buff *skb; - unsigned int qlen; - mutex_lock(&audit_cmd_mutex); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - audit_receive_skb(skb); - kfree_skb(skb); - } + audit_receive_skb(skb); mutex_unlock(&audit_cmd_mutex); } @@ -876,8 +868,8 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, - NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, + audit_receive, NULL, THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3401293359e..04f3ffb8d9d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2023,7 +2023,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->d.next = ctx->aux_pids; ctx->aux_pids = (void *)axp; } - BUG_ON(axp->pid_count > AUDIT_AUX_PIDS); + BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); axp->target_pid[axp->pid_count] = t->tgid; selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); diff --git a/kernel/cpu.c b/kernel/cpu.c index 181ae708602..38033db8d8e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu) return err; } -#ifdef CONFIG_SUSPEND_SMP +#ifdef CONFIG_PM_SLEEP_SMP static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -334,4 +334,4 @@ void enable_nonboot_cpus(void) out: mutex_unlock(&cpu_add_remove_lock); } -#endif +#endif /* CONFIG_PM_SLEEP_SMP */ diff --git a/kernel/exit.c b/kernel/exit.c index 9578c1ae19c..993369ee94d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -24,7 +24,6 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> -#include <linux/signalfd.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> @@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); - /* - * Notify that this sighand has been detached. This must - * be called with the tsk->sighand lock held. Also, this - * access tsk->sighand internally, so it must be called - * before tsk->sighand is reset. 
- */ - signalfd_detach_locked(tsk); - posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) posix_cpu_timers_exit_group(tsk); @@ -975,6 +966,7 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); @@ -996,7 +988,6 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->binfmt) module_put(tsk->binfmt->module); - tsk->exit_code = code; proc_exit_connector(tsk); exit_task_namespaces(tsk); exit_notify(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 7332e236d36..5e67f90a169 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); - INIT_LIST_HEAD(&sighand->signalfd_list); + init_waitqueue_head(&sighand->signalfd_wqh); } void __init proc_caches_init(void) @@ -1608,7 +1608,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| + CLONE_NEWNET)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) diff --git a/kernel/futex.c b/kernel/futex.c index 3415e9ad139..fcc94e7b408 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1670,6 +1670,7 @@ pi_faulted: attempt); if (ret) goto out; + uval = 0; goto retry_unlocked; } @@ -1942,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; unsigned long futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -1964,12 +1966,14 @@ void exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; - if (pending) - handle_futex_death((void __user *)pending + futex_offset, - curr, pip); - + next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); + /* * A pending lock might already be on the list, so * don't process it twice: */ @@ -1977,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr) if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&entry, &entry->next, &pi)) + if (rc) return; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -1990,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } + + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f7921360efa..2c2e2954b71 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user 
*head = curr->compat_robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - compat_uptr_t uentry, upending; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; + compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; - if (upending) - handle_futex_death((void __user *)pending + futex_offset, curr, pip); - while (compat_ptr(uentry) != &head->list) { + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: @@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr) curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&uentry, &entry, - (compat_uptr_t __user *)&entry->next, &pi)) + if (rc) return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr) cond_resched(); } + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } asmlinkage long diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c21ca6bfaa6..dc8a4451d79 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -277,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } EXPORT_SYMBOL_GPL(ktime_add_ns); + +/** + * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable + * @kt: minuend + * @nsec: the scalar nsec value to subtract + * + * Returns the subtraction of @nsec from @kt in ktime_t format + */ +ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_sub(kt, tmp); +} + +EXPORT_SYMBOL_GPL(ktime_sub_ns); # endif /* !CONFIG_KTIME_SCALAR */ /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 203a518b6f1..7230d914eaa 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -462,7 +462,9 @@ void free_irq(unsigned int irq, void *dev_id) * We do this after actually deregistering it, to make sure that * a 'real' IRQ doesn't run in parallel with our fake */ + local_irq_save(flags); handler(irq, dev_id); + local_irq_restore(flags); } #endif } @@ -545,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler, * We do this before actually registering it, to make sure that * a 'real' IRQ doesn't run in parallel with our fake */ - if (irqflags & IRQF_DISABLED) { - unsigned long flags; + unsigned long flags; - local_irq_save(flags); - handler(irq, dev_id); - local_irq_restore(flags); - } else - handler(irq, dev_id); + local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); } #endif diff --git a/kernel/kmod.c b/kernel/kmod.c index 9809cc1f33d..c6a4f8aebeb 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -505,7 +505,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, 
if (ret < 0) goto out; - return call_usermodehelper_exec(sub_info, 1); + return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); out: call_usermodehelper_freeinfo(sub_info); diff --git a/kernel/module.c b/kernel/module.c index 33c04ad5117..db0ead0363e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module *mod, char *buffer) { - /* sysfs holds a reference */ - return sprintf(buffer, "%u\n", module_refcount(mod)-1); + return sprintf(buffer, "%u\n", module_refcount(mod)); } static struct module_attribute refcnt = { diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a4fb7d46971..f1decd21a53 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -20,6 +20,7 @@ #include <linux/mnt_namespace.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> +#include <net/net_namespace.h> static struct kmem_cache *nsproxy_cachep; @@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_user; } + new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); + if (IS_ERR(new_nsp->net_ns)) { + err = PTR_ERR(new_nsp->net_ns); + goto out_net; + } + return new_nsp; +out_net: + if (new_nsp->user_ns) + put_user_ns(new_nsp->user_ns); out_user: if (new_nsp->pid_ns) put_pid_ns(new_nsp->pid_ns); @@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); if (ns->user_ns) put_user_ns(ns->user_ns); + put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -177,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER))) + CLONE_NEWUSER | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 55b3761edaa..57efe0400bc 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -547,9 +547,9 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_process = process; list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) get_task_struct(process); + spin_unlock_irqrestore(&process->sighand->siglock, flags); } else { spin_unlock_irqrestore(&process->sighand->siglock, flags); process = NULL; @@ -605,13 +605,14 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); if (timr) { spin_lock(&timr->it_lock); - spin_unlock(&idr_lock); if ((timr->it_id != timer_id) || !(timr->it_process) || timr->it_process->tgid != current->tgid) { - unlock_timer(timr, *flags); + spin_unlock(&timr->it_lock); + spin_unlock_irqrestore(&idr_lock, *flags); timr = NULL; - } + } else + spin_unlock(&idr_lock); } else spin_unlock_irqrestore(&idr_lock, *flags); @@ -711,7 +712,7 @@ sys_timer_getoverrun(timer_t timer_id) { struct k_itimer *timr; int overrun; - long flags; + unsigned long flags; timr = lock_timer(timer_id, &flags); if (!timr) @@ -783,7 +784,7 @@ sys_timer_settime(timer_t timer_id, int flags, struct k_itimer *timr; struct 
itimerspec new_spec, old_spec; int error = 0; - long flag; + unsigned long flag; struct itimerspec *rtn = old_setting ? &old_spec : NULL; if (!new_setting) @@ -835,7 +836,7 @@ asmlinkage long sys_timer_delete(timer_t timer_id) { struct k_itimer *timer; - long flags; + unsigned long flags; retry_delete: timer = lock_timer(timer_id, &flags); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 412859f8d94..14b0e10dc95 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -72,15 +72,10 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. -config SUSPEND_SMP_POSSIBLE - bool - depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC)) - depends on SMP - default y - -config SUSPEND_SMP +config PM_SLEEP_SMP bool - depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP + depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE + depends on PM_SLEEP select HOTPLUG_CPU default y @@ -89,20 +84,46 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION default y +config SUSPEND_UP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \ + || SUPERH || FRV + depends on !SMP + default y + +config SUSPEND_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) \ + || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM + depends on SMP + default y + config SUSPEND bool "Suspend to RAM and standby" depends on PM - depends on !SMP || SUSPEND_SMP_POSSIBLE + depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (i.e. the ACPI S3 state). +config HIBERNATION_UP_POSSIBLE + bool + depends on X86 || PPC64_SWSUSP || PPC32 + depends on !SMP + default y + +config HIBERNATION_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP + depends on SMP + default y + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP - depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE + depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/printk.c b/kernel/printk.c index bd2cd062878..8451dfc31d2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1083,6 +1083,19 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +static int __init disable_boot_consoles(void) +{ + if (console_drivers != NULL) { + if (console_drivers->flags & CON_BOOT) { + printk(KERN_INFO "turn off boot console %s%d\n", + console_drivers->name, console_drivers->index); + return unregister_console(console_drivers); + } + } + return 0; +} +late_initcall(disable_boot_consoles); + /** * tty_write_message - write a message to a certain tty, not just the console. * @tty: the destination tty_struct diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 82a558b655d..3eca7a55f2e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. 
*/ ptrace_disable(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); /* protect against de_thread()->release_task() */ diff --git a/kernel/sched.c b/kernel/sched.c index 45e17b83b7f..6c10fa796ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -61,6 +61,7 @@ #include <linux/delayacct.h> #include <linux/reciprocal_div.h> #include <linux/unistd.h> +#include <linux/pagemap.h> #include <asm/tlb.h> @@ -262,7 +263,8 @@ struct rq { s64 clock_max_delta; unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; + u64 idle_clock; + unsigned int clock_deep_idle_events; u64 tick_timestamp; atomic_t nr_iowait; @@ -556,18 +558,40 @@ static inline struct rq *this_rq_lock(void) } /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * We are going deep-idle (irqs are disabled): */ -void sched_clock_unstable_event(void) +void sched_clock_idle_sleep_event(void) { - unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(smp_processor_id()); - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, &flags); + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); } +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* * resched_task - mark a task 'to be rescheduled now'. 
@@ -645,7 +669,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) /* * Shift right and round: */ -#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, @@ -661,10 +685,10 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, * Check whether we'd overflow the 64-bit multiplication: */ if (unlikely(tmp > WMULT_CONST)) - tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2); else - tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT); + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } @@ -835,7 +859,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq) static void set_load_weight(struct task_struct *p) { - task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; p->se.wait_runtime = 0; if (task_has_rt_policy(p)) { @@ -1564,6 +1587,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; @@ -1659,6 +1683,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->prio = effective_prio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || !current->se.on_rq) { @@ -2157,12 +2186,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; - /* - * Aggressive migration if too many balance attempts have failed: - */ - if (sd->nr_balance_failed > sd->cache_nice_tries) - return 1; - return 1; } @@ -2494,7 +2517,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2546,10 +2569,8 @@ small_imbalance: pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ - if (pwr_move <= pwr_now) - goto out_balanced; - - *imbalance = busiest_load_per_task; + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; } return busiest; @@ -3020,6 +3041,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3056,8 +3078,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) if (sd->flags & SD_SERIALIZE) spin_unlock(&balancing); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. There is another @@ -3067,7 +3091,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. 
+ */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* @@ -4525,10 +4556,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) - schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's @@ -4884,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_runtime_limit = sysctl_sched_latency; + sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; } #ifdef CONFIG_SMP @@ -5234,15 +5266,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu) static struct ctl_table sd_ctl_dir[] = { { .procname = "sched_domain", - .mode = 0755, + .mode = 0555, }, {0,}, }; static struct ctl_table sd_ctl_root[] = { { + .ctl_name = CTL_KERN, .procname = "kernel", - .mode = 0755, + .mode = 0555, .child = sd_ctl_dir, }, {0,}, @@ -5318,7 +5351,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) for_each_domain(cpu, sd) { snprintf(buf, 32, "domain%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0755; + entry->mode = 0555; entry->child = sd_alloc_ctl_domain_table(sd); entry++; i++; @@ -5338,7 +5371,7 @@ static void init_sched_domain_sysctl(void) for (i = 0; i < cpu_num; i++, entry++) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0755; + entry->mode = 0555; entry->child = sd_alloc_ctl_cpu_table(i); } sd_sysctl_header = register_sysctl_table(sd_ctl_root); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 87e524762b8..c3ee38bd342 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(next_balance); P(curr->pid); P(clock); + P(idle_clock); P(prev_clock_raw); P(clock_warps); P(clock_overflows); - P(clock_unstable_events); + P(clock_deep_idle_events); P(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); @@ -282,4 +283,5 @@ void proc_sched_set_task(struct task_struct *p) p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; #endif p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fedbb51bba9..67c67a87146 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -15,34 +15,50 @@ * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> */ /* - * Preemption granularity: - * (default: 2 msec, units: nanoseconds) + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms, units: nanoseconds) * - * NOTE: this granularity value is not the 
same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length. + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches field) * * On SMP systems the value of this is multiplied by the log2 of the * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) + * Targeted preemption latency for CPU-bound tasks: + */ +unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; +unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; + +/* + * sys_sched_yield() compat mode + * + * This option switches the agressive yield implementation of the + * old scheduler back on. + */ +unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 10 msec, units: nanoseconds) + * (default: 25 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = - 10000000000ULL/HZ; +unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -52,12 +68,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; +unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; unsigned int sysctl_sched_stat_granularity __read_mostly; /* - * Initialized in sched_init_granularity(): + * Initialized in sched_init_granularity() [to 5 times the base granularity]: */ unsigned int sysctl_sched_runtime_limit __read_mostly; @@ -186,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static inline void @@ -197,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + + schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -214,6 +234,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) */ /* + * Calculate the preemption granularity needed to schedule every + * runnable task once per sysctl_sched_latency amount of time. + * (down to a sensible low limit on granularity) + * + * For example, if there are 2 tasks running and latency is 10 msecs, + * we switch tasks every 5 msecs. If we have 3 tasks running, we have + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency + * for each task. We do finer and finer scheduling up to until we + * reach the minimum granularity value. 
+ * + * To achieve this we use the following dynamic-granularity rule: + * + * gran = lat/nr - lat/nr/nr + * + * This comes out of the following equations: + * + * kA1 + gran = kB1 + * kB2 + gran = kA2 + * kA2 = kA1 + * kB2 = kB1 - d + d/nr + * lat = d * nr + * + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), + * '1' is start of time, '2' is end of time, 'd' is delay between + * 1 and 2 (during which task B was running), 'nr' is number of tasks + * running, 'lat' is the the period of each task. ('lat' is the + * sched_latency that we aim for.) + */ +static long +sched_granularity(struct cfs_rq *cfs_rq) +{ + unsigned int gran = sysctl_sched_latency; + unsigned int nr = cfs_rq->nr_running; + + if (nr > 1) { + gran = gran/nr - gran/nr/nr; + gran = max(gran, sysctl_sched_min_granularity); + } + + return gran; +} + +/* * We rescale the rescheduling granularity of tasks according to their * nice level, but only linearly, not exponentially: */ @@ -240,7 +303,7 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity) /* * It will always fit into 'long': */ - return (long) (tmp >> WMULT_SHIFT); + return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); } static inline void @@ -303,10 +366,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { - delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec); - delta = calc_delta_mine(delta, curr->load.weight, lw); - delta = min((u64)delta, cfs_rq->sleeper_bonus); + if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { + delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); + delta = min(delta, (unsigned long)( + (long)sysctl_sched_runtime_limit - curr->wait_runtime)); cfs_rq->sleeper_bonus -= delta; delta_mine -= delta; } @@ -438,6 +501,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long delta_fair; + if (unlikely(!se->wait_start_fair)) + return; + delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->wait_start_fair)); @@ -494,6 +560,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) unsigned long load = cfs_rq->load.weight, delta_fair; long prev_runtime; + /* + * Do not boost sleepers if there's too much bonus 'in flight' + * already: + */ + if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) + return; + if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) load = rq_of(cfs_rq)->cpu_load[2]; @@ -519,10 +592,6 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * Track the amount of bonus we've given to sleepers: */ cfs_rq->sleeper_bonus += delta_fair; - if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) - cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -570,6 +639,16 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->block_start = 0; se->sum_sleep_runtime += delta; + + /* + * Blocking time is in units of nanosecs, so shift by 20 to + * get a milliseconds-range estimation of the amount of + * time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), + delta >> 20); + } } #endif } @@ -604,7 +683,6 @@ 
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) if (tsk->state & TASK_UNINTERRUPTIBLE) se->block_start = rq_of(cfs_rq)->clock; } - cfs_rq->wait_runtime -= se->wait_runtime; #endif } __dequeue_entity(cfs_rq, se); @@ -618,11 +696,31 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { s64 __delta = curr->fair_key - se->fair_key; + unsigned long ideal_runtime, delta_exec; + + /* + * ideal_runtime is compared against sum_exec_runtime, which is + * walltime, hence do not scale. + */ + ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, + (unsigned long)sysctl_sched_min_granularity); + + /* + * If we executed more than what the latency constraint suggests, + * reduce the rescheduling granularity. This way the total latency + * of how much a task is not scheduled converges to + * sysctl_sched_latency: + */ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + granularity = 0; /* * Take scheduling granularity into account - do not * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: + * + * scale granularity as key space is in fair_clock. */ if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); @@ -641,6 +739,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end(cfs_rq, se); update_stats_curr_start(cfs_rq, se); set_cfs_rq_curr(cfs_rq, se); + se->prev_sum_exec_runtime = se->sum_exec_runtime; } static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) @@ -686,7 +785,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** @@ -815,19 +915,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) } /* - * sched_yield() support is very simple - we dequeue and enqueue + * sched_yield() support is very simple - we dequeue and enqueue. + * + * If compat_yield is turned on then we requeue to the end of the tree. */ static void yield_task_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *rightmost, *se = &p->se; + struct rb_node *parent; - __update_rq_clock(rq); /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Are we the only task in the tree? + */ + if (unlikely(cfs_rq->nr_running == 1)) + return; + + if (likely(!sysctl_sched_compat_yield)) { + __update_rq_clock(rq); + /* + * Dequeue and enqueue the task to update its + * position within the tree: + */ + dequeue_entity(cfs_rq, &p->se, 0); + enqueue_entity(cfs_rq, &p->se, 0); + + return; + } + /* + * Find the rightmost entry in the rbtree: + */ + do { + parent = *link; + link = &parent->rb_right; + } while (*link); + + rightmost = rb_entry(parent, struct sched_entity, run_node); + /* + * Already in the rightmost position? 
+ */ + if (unlikely(rightmost == se)) + return; + + /* + * Minimally necessary key value to be last in the tree: */ - dequeue_entity(cfs_rq, &p->se, 0); - enqueue_entity(cfs_rq, &p->se, 0); + se->fair_key = rightmost->fair_key + 1; + + if (cfs_rq->rb_leftmost == &se->run_node) + cfs_rq->rb_leftmost = rb_next(&se->run_node); + /* + * Relink the task to the rightmost position: + */ + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } /* @@ -1020,31 +1163,32 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se; + struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); sched_info_queued(p); + update_curr(cfs_rq); update_stats_enqueue(cfs_rq, se); /* * Child runs first: we let it run before the parent * until it reschedules once. We set up the key so that * it will preempt the parent: */ - p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + se->fair_key = curr->fair_key - + niced_granularity(curr, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: */ if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - p->se.wait_start_fair = 0; + se->wait_start_fair = 0; /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -(sysctl_sched_granularity / 2); + se->wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); } @@ -1057,7 +1201,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) */ static void set_curr_task_fair(struct rq *rq) { - struct sched_entity *se = &rq->curr.se; + struct sched_entity *se = &rq->curr->se; for_each_sched_entity(se) set_next_entity(cfs_rq_of(se), se); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index dcdcad632fd..4b87476a02d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -207,10 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) return; p->time_slice = static_prio_timeslice(p->static_prio); - set_tsk_need_resched(p); - /* put it at the end of the queue: */ - requeue_task_rt(rq, p); + /* + * Requeue to the end of queue if we are not the only element + * on the queue: + */ + if (p->run_list.prev != p->run_list.next) { + requeue_task_rt(rq, p); + set_tsk_need_resched(p); + } } static struct sched_class rt_sched_class __read_mostly = { diff --git a/kernel/signal.c b/kernel/signal.c index b27c01a6644..79295238109 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -378,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - if (tsk == current) - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); @@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } } } - if (likely(tsk == current)) - recalc_sigpending(); + recalc_sigpending(); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. 
Our @@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } - if ( signr && + if (signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ /* @@ -533,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) - return error; - - error = -EPERM; - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && ((sig != SIGCONT) || - (process_session(current) != process_session(t))) - && (current->euid ^ t->suid) && (current->euid ^ t->uid) - && (current->uid ^ t->suid) && (current->uid ^ t->uid) - && !capable(CAP_KILL)) + if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) + return error; + error = -EPERM; + if (((sig != SIGCONT) || + (process_session(current) != process_session(t))) + && (current->euid ^ t->suid) && (current->euid ^ t->uid) + && (current->uid ^ t->suid) && (current->uid ^ t->uid) + && !capable(CAP_KILL)) return error; + } return security_task_kill(t, info, sig, 0); } @@ -1300,20 +1298,19 @@ struct sigqueue *sigqueue_alloc(void) void sigqueue_free(struct sigqueue *q) { unsigned long flags; + spinlock_t *lock = ¤t->sighand->siglock; + BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); /* * If the signal is still pending remove it from the - * pending queue. + * pending queue. We must hold ->siglock while testing + * q->list to serialize with collect_signal(). */ - if (unlikely(!list_empty(&q->list))) { - spinlock_t *lock = ¤t->sighand->siglock; - read_lock(&tasklist_lock); - spin_lock_irqsave(lock, flags); - if (!list_empty(&q->list)) - list_del_init(&q->list); - spin_unlock_irqrestore(lock, flags); - read_unlock(&tasklist_lock); - } + spin_lock_irqsave(lock, flags); + if (!list_empty(&q->list)) + list_del_init(&q->list); + spin_unlock_irqrestore(lock, flags); + q->flags &= ~SIGQUEUE_PREALLOC; __sigqueue_free(q); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f546ddea43..bd89bc4eb0b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -271,8 +271,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); - #endif /* @@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -EXPORT_SYMBOL(raise_softirq_irqoff); - void fastcall raise_softirq(unsigned int nr) { unsigned long flags; diff --git a/kernel/sys.c b/kernel/sys.c index 449b81b98b3..8ae2e636eb1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> +#include <linux/cpu.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -878,6 +879,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); + disable_nonboot_cpus(); sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); @@ -1442,7 +1444,6 @@ asmlinkage long sys_times(struct tms __user * tbuf) * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 
* LBT 04.03.94 */ - asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; @@ -1470,7 +1471,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->real_parent == group_leader) { + if (p->real_parent->tgid == group_leader->tgid) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bdb8c07e04..6c97259e863 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,7 +27,6 @@ #include <linux/capability.h> #include <linux/ctype.h> #include <linux/utsname.h> -#include <linux/capability.h> #include <linux/smp_lock.h> #include <linux/fs.h> #include <linux/init.h> @@ -223,8 +222,19 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, @@ -284,6 +294,23 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_features", + .data = &sysctl_sched_features, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_compat_yield", + .data = &sysctl_sched_compat_yield, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROVE_LOCKING { .ctl_name = CTL_UNNUMBERED, @@ -305,15 +332,6 @@ static ctl_table kern_table[] = { }, #endif { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, @@ -1035,7 +1053,7 @@ static ctl_table vm_table[] = { .strategy = &sysctl_string, }, #endif -#if defined(CONFIG_X86_32) || \ +#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .ctl_name = VM_VDSO_ENABLED, @@ -1203,7 +1221,7 @@ static ctl_table fs_table[] = { }; static ctl_table debug_table[] = { -#ifdef CONFIG_X86 +#if defined(CONFIG_X86) || defined(CONFIG_PPC) { .ctl_name = CTL_UNNUMBERED, .procname = "exception-trace", diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6635112654..8d53106a0a9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -23,3 +23,8 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. 
+config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 99b6034fc86..905b0b50792 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 41dd3105ce7..822beebe664 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -194,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old, local_irq_restore(flags); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events */ @@ -222,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); - +#endif diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index cd91237dbfe..de6a2d6b3eb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -226,7 +226,7 @@ static void sync_cmos_clock(unsigned long dummy) static void notify_cmos_timer(void) { - if (no_sync_cmos_clock) + if (!no_sync_cmos_clock) mod_timer(&sync_cmos_timer, jiffies + 1); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index db8e0f3d409..fc3fc79b3d5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -64,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if (tick_broadcast_device.evtdev || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if ((tick_broadcast_device.evtdev && + tick_broadcast_device.evtdev->rating >= dev->rating) || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; clockevents_exchange_device(NULL, dev); @@ -176,8 +177,6 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { - dev->next_event.tv64 = KTIME_MAX; - tick_do_periodic_broadcast(); /* @@ -218,26 +217,43 @@ static void tick_do_broadcast_on_off(void *why) bc = tick_broadcast_device.evtdev; /* - * Is the device in broadcast mode forever or is it not - * affected by the powerstate ? + * Is the device not affected by the powerstate ? */ - if (!dev || !tick_device_is_functional(dev) || - !(dev->features & CLOCK_EVT_FEAT_C3STOP)) + if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) goto out; - if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_ON) { + /* + * Defect device ? + */ + if (!tick_device_is_functional(dev)) { + /* + * AMD C1E wreckage fixup: + * + * Device was registered functional in the first + * place. 
Now the secondary CPU detected the C1E + * misfeature and notifies us to fix it up + */ + if (*reason != CLOCK_EVT_NOTIFY_BROADCAST_FORCE) + goto out; + } + + switch (*reason) { + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); } - } else { + break; + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: if (cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } + break; } if (cpus_empty(tick_broadcast_mask)) @@ -383,11 +399,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - - if(!cpus_empty(tick_broadcast_oneshot_mask)) - tick_broadcast_set_event(ktime_get(), 1); - - return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask); + return 0; } /* @@ -519,11 +531,9 @@ static void tick_broadcast_clear_oneshot(int cpu) */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; - } + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; } /* @@ -549,20 +559,17 @@ void tick_broadcast_switch_to_oneshot(void) */ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { - struct clock_event_device *bc; unsigned long flags; unsigned int cpu = *cpup; spin_lock_irqsave(&tick_broadcast_lock, flags); - bc = tick_broadcast_device.evtdev; + /* + * Clear the broadcast mask flag for the dead cpu, but do not + * stop the broadcast device! + */ cpu_clear(cpu, tick_broadcast_oneshot_mask); - if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { - if (bc && cpus_empty(tick_broadcast_oneshot_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 77a21abc871..1bea399a9ef 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) cpu = smp_processor_id(); if (!cpu_isset(cpu, newdev->cpumask)) - goto out; + goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; @@ -265,7 +265,7 @@ out_bc: */ if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; -out: + spin_unlock_irqrestore(&tick_device_lock, flags); return ret; @@ -345,6 +345,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: tick_broadcast_on_off(reason, dev); break; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b416995b975..8c3fef1db09 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. 
If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. + */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + } + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index acc417b5a9b..4ad79f6bdec 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -217,6 +217,7 @@ static void change_clocksource(void) } #else static inline void change_clocksource(void) { } +static inline s64 __get_nsec_offset(void) { return 0; } #endif /** @@ -280,6 +281,8 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; +/* xtime offset when we went into suspend */ +static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -305,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } + /* Make sure that we have the correct xtime reference */ + timespec_add_ns(&xtime, timekeeping_suspend_nsecs); /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; @@ -325,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + timekeeping_suspend_time = read_persistent_clock(); + write_seqlock_irqsave(&xtime_lock, flags); + /* Get the current xtime offset */ + timekeeping_suspend_nsecs = __get_nsec_offset(); timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); write_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 3c38fb5eae1..c36bb7ed030 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v) ms = 1; if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, - events / period.tv_sec, events * 1000 / ms); + seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", + events, events * 1000 / ms, + (events * 1000000 / ms) % 1000); else seq_printf(m, "%ld total events\n", events); diff --git a/kernel/user.c b/kernel/user.c index e7d11cef699..9ca2848fc35 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,25 +55,22 @@ struct user_struct root_user = { /* * These routines must be called with the uidhash spinlock held! 
*/ -static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { - list_add(&up->uidhash_list, hashent); + hlist_add_head(&up->uidhash_node, hashent); } static inline void uid_hash_remove(struct user_struct *up) { - list_del(&up->uidhash_list); + hlist_del_init(&up->uidhash_node); } -static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { - struct list_head *up; - - list_for_each(up, hashent) { - struct user_struct *user; - - user = list_entry(up, struct user_struct, uidhash_list); + struct user_struct *user; + struct hlist_node *h; + hlist_for_each_entry(user, h, hashent, uidhash_node) { if(user->uid == uid) { atomic_inc(&user->__count); return user; @@ -122,7 +119,7 @@ void free_uid(struct user_struct *up) struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(ns, uid); + struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -202,6 +199,30 @@ void switch_uid(struct user_struct *new_user) suid_keys(current); } +void release_uids(struct user_namespace *ns) +{ + int i; + unsigned long flags; + struct hlist_head *head; + struct hlist_node *nd; + + spin_lock_irqsave(&uidhash_lock, flags); + /* + * collapse the chains so that the user_struct-s will + * be still alive, but not in hashes. subsequent free_uid() + * will free them. + */ + for (i = 0; i < UIDHASH_SZ; i++) { + head = ns->uidhash_table + i; + while (!hlist_empty(head)) { + nd = head->first; + hlist_del_init(nd); + } + } + spin_unlock_irqrestore(&uidhash_lock, flags); + + free_uid(ns->root_user); +} static int __init uid_cache_init(void) { @@ -211,7 +232,7 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(init_user_ns.uidhash_table + n); + INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d055d987850..7af90fc4f0f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(ns->uidhash_table + n); + INIT_HLIST_HEAD(ns->uidhash_table + n); /* Insert new root user. 
*/ ns->root_user = alloc_uid(ns, 0); @@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); + release_uids(ns); kfree(ns); } diff --git a/kernel/utsname.c b/kernel/utsname.c index 9d8180a0f0d..816d7b24fa0 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) if (!ns) return ERR_PTR(-ENOMEM); + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + up_read(&uts_sem); kref_init(&ns->kref); return ns; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 58e5c152a6b..e080d1d744c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -635,7 +635,7 @@ int keventd_up(void) int current_is_keventd(void) { struct cpu_workqueue_struct *cwq; - int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ + int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ int ret = 0; BUG_ON(!keventd_wq); |
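The sched_fair.c hunk above replaces the fixed preemption granularity with a dynamic one, gran = lat/nr - lat/nr/nr, clamped from below by sysctl_sched_min_granularity. A minimal user-space sketch of that arithmetic, using the default values the patch sets (20 ms latency target, 2 ms minimum granularity) as assumed example inputs, not kernel code:

#include <stdio.h>

/*
 * Illustrative values only, mirroring the defaults the patch installs:
 * sysctl_sched_latency = 20 ms, sysctl_sched_min_granularity = 2 ms,
 * both expressed in nanoseconds.
 */
static const unsigned int sched_latency_ns         = 20000000U;
static const unsigned int sched_min_granularity_ns =  2000000U;

/*
 * gran = lat/nr - lat/nr/nr, clamped to the minimum granularity,
 * using the same integer arithmetic as the sched_granularity()
 * helper added by the patch.
 */
static unsigned int granularity(unsigned int nr_running)
{
	unsigned int gran = sched_latency_ns;

	if (nr_running > 1) {
		gran = gran / nr_running - gran / nr_running / nr_running;
		if (gran < sched_min_granularity_ns)
			gran = sched_min_granularity_ns;
	}
	return gran;
}

int main(void)
{
	unsigned int nr;

	/* Show how the slice shrinks with more runnable tasks and then
	 * hits the minimum-granularity floor. */
	for (nr = 1; nr <= 16; nr *= 2)
		printf("nr_running=%2u -> granularity=%u ns\n",
		       nr, granularity(nr));
	return 0;
}

With these example inputs, two runnable tasks get 5 ms, four get 3.75 ms, and by sixteen tasks the 2 ms floor takes over, which is the behaviour the in-patch comment describes.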