Diffstat (limited to 'kernel')
71 files changed, 3643 insertions, 2175 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c64ce9c1420..6b066632e40 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -63,3 +63,6 @@ config PREEMPT_BKL Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. +config PREEMPT_NOTIFIERS + bool + diff --git a/kernel/acct.c b/kernel/acct.c index 70d0d88e555..24f0f8b2ba7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -468,7 +468,7 @@ static void do_acct_process(struct file *file) } #endif do_div(elapsed, AHZ); - ac.ac_btime = xtime.tv_sec - elapsed; + ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; diff --git a/kernel/audit.c b/kernel/audit.c index eb0f9165b40..2924251a654 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -847,18 +847,10 @@ static void audit_receive_skb(struct sk_buff *skb) } /* Receive messages from netlink socket. */ -static void audit_receive(struct sock *sk, int length) +static void audit_receive(struct sk_buff *skb) { - struct sk_buff *skb; - unsigned int qlen; - mutex_lock(&audit_cmd_mutex); - - for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { - skb = skb_dequeue(&sk->sk_receive_queue); - audit_receive_skb(skb); - kfree_skb(skb); - } + audit_receive_skb(skb); mutex_unlock(&audit_cmd_mutex); } @@ -876,8 +868,8 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, - NULL, THIS_MODULE); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0, + audit_receive, NULL, THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 1bf093dcffe..359645cff5b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -304,7 +304,7 @@ int __init audit_register_class(int class, unsigned *list) int audit_match_class(int class, unsigned syscall) { - if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32))) + if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) return 0; if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) return 0; @@ -456,6 +456,13 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: + /* bit ops are only useful on syscall args */ + if (f->op == AUDIT_BIT_MASK || + f->op == AUDIT_BIT_TEST) { + err = -EINVAL; + goto exit_free; + } + break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: @@ -1566,6 +1573,10 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return (left > right); case AUDIT_GREATER_THAN_OR_EQUAL: return (left >= right); + case AUDIT_BIT_MASK: + return (left & right); + case AUDIT_BIT_TEST: + return ((left & right) == right); } BUG(); return 0; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b7640a5f382..04f3ffb8d9d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -153,7 +153,7 @@ struct audit_aux_data_execve { struct audit_aux_data d; int argc; int envc; - char mem[0]; + struct mm_struct *mm; }; struct audit_aux_data_socketcall { @@ -173,12 +173,6 @@ struct audit_aux_data_fd_pair { int fd[2]; }; -struct audit_aux_data_path { - struct audit_aux_data d; - struct dentry *dentry; - struct vfsmount *mnt; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -654,12 +648,6 @@ static inline void 
audit_free_aux(struct audit_context *context) struct audit_aux_data *aux; while ((aux = context->aux)) { - if (aux->type == AUDIT_AVC_PATH) { - struct audit_aux_data_path *axi = (void *)aux; - dput(axi->dentry); - mntput(axi->mnt); - } - context->aux = aux->next; kfree(aux); } @@ -831,6 +819,57 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, return rc; } +static void audit_log_execve_info(struct audit_buffer *ab, + struct audit_aux_data_execve *axi) +{ + int i; + long len, ret; + const char __user *p; + char *buf; + + if (axi->mm != current->mm) + return; /* execve failed, no additional info */ + + p = (const char __user *)axi->mm->arg_start; + + for (i = 0; i < axi->argc; i++, p += len) { + len = strnlen_user(p, MAX_ARG_STRLEN); + /* + * We just created this mm, if we can't find the strings + * we just copied into it something is _very_ wrong. Similar + * for strings that are too long, we should not have created + * any. + */ + if (!len || len > MAX_ARG_STRLEN) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + audit_panic("out of memory for argv string\n"); + break; + } + + ret = copy_from_user(buf, p, len); + /* + * There is no reason for this copy to be short. We just + * copied them here, and the mm hasn't been exposed to user- + * space yet. + */ + if (ret) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + } + + audit_log_format(ab, "a%d=", i); + audit_log_untrustedstring(ab, buf); + audit_log_format(ab, "\n"); + + kfree(buf); + } +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -946,7 +985,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - "ouid=%u ogid=%u mode=%x", + "ouid=%u ogid=%u mode=%#o", axi->uid, axi->gid, axi->mode); if (axi->osid != 0) { char *ctx = NULL; @@ -965,19 +1004,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - "qbytes=%lx ouid=%u ogid=%u mode=%x", + "qbytes=%lx ouid=%u ogid=%u mode=%#o", axi->qbytes, axi->uid, axi->gid, axi->mode); break; } case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; - int i; - const char *p; - for (i = 0, p = axi->mem; i < axi->argc; i++) { - audit_log_format(ab, "a%d=", i); - p = audit_log_untrustedstring(ab, p); - audit_log_format(ab, "\n"); - } + audit_log_execve_info(ab, axi); break; } case AUDIT_SOCKETCALL: { @@ -995,11 +1028,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_hex(ab, axs->a, axs->len); break; } - case AUDIT_AVC_PATH: { - struct audit_aux_data_path *axi = (void *)aux; - audit_log_d_path(ab, "path=", axi->dentry, axi->mnt); - break; } - case AUDIT_FD_PAIR: { struct audit_aux_data_fd_pair *axs = (void *)aux; audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); @@ -1821,32 +1849,31 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode return 0; } +int audit_argv_kb = 32; + int audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - unsigned long p, next; - void *to; if (likely(!audit_enabled || !context || context->dummy)) return 0; - ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, - GFP_KERNEL); + /* + * Even though the 
stack code doesn't limit the arg+env size any more, + * the audit code requires that _all_ arguments be logged in a single + * netlink skb. Hence cap it :-( + */ + if (bprm->argv_len > (audit_argv_kb << 10)) + return -E2BIG; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; ax->argc = bprm->argc; ax->envc = bprm->envc; - for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { - struct page *page = bprm->page[p / PAGE_SIZE]; - void *kaddr = kmap(page); - next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); - memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); - to += next - p; - kunmap(page); - } - + ax->mm = bprm->mm; ax->d.type = AUDIT_EXECVE; ax->d.next = context->aux; context->aux = (void *)ax; @@ -1949,36 +1976,6 @@ void __audit_ptrace(struct task_struct *t) } /** - * audit_avc_path - record the granting or denial of permissions - * @dentry: dentry to record - * @mnt: mnt to record - * - * Returns 0 for success or NULL context or < 0 on error. - * - * Called from security/selinux/avc.c::avc_audit() - */ -int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) -{ - struct audit_aux_data_path *ax; - struct audit_context *context = current->audit_context; - - if (likely(!context)) - return 0; - - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); - if (!ax) - return -ENOMEM; - - ax->dentry = dget(dentry); - ax->mnt = mntget(mnt); - - ax->d.type = AUDIT_AVC_PATH; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; -} - -/** * audit_signal_info - record signal info for shutting down audit subsystem * @sig: signal value * @t: task being signaled @@ -1995,19 +1992,19 @@ int __audit_signal_info(int sig, struct task_struct *t) extern uid_t audit_sig_uid; extern u32 audit_sig_sid; - if (audit_pid && t->tgid == audit_pid && - (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1)) { - audit_sig_pid = tsk->pid; - if (ctx) - audit_sig_uid = ctx->loginuid; - else - audit_sig_uid = tsk->uid; - selinux_get_task_sid(tsk, &audit_sig_sid); + if (audit_pid && t->tgid == audit_pid) { + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + audit_sig_pid = tsk->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = tsk->uid; + selinux_get_task_sid(tsk, &audit_sig_sid); + } + if (!audit_signals || audit_dummy_context()) + return 0; } - if (!audit_signals) /* audit_context checked in wrapper */ - return 0; - /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { @@ -2026,7 +2023,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->d.next = ctx->aux_pids; ctx->aux_pids = (void *)axp; } - BUG_ON(axp->pid_count > AUDIT_AUX_PIDS); + BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); axp->target_pid[axp->pid_count] = t->tgid; selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); diff --git a/kernel/cpu.c b/kernel/cpu.c index 181ae708602..38033db8d8e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu) return err; } -#ifdef CONFIG_SUSPEND_SMP +#ifdef CONFIG_PM_SLEEP_SMP static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -334,4 +334,4 @@ void enable_nonboot_cpus(void) out: mutex_unlock(&cpu_add_remove_lock); } -#endif +#endif /* CONFIG_PM_SLEEP_SMP */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b4796d85014..57e6448b171 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -516,7 +516,7 @@ static void cpuset_release_agent(const char *pathbuf) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - 
call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); kfree(pathbuf); } diff --git a/kernel/exit.c b/kernel/exit.c index e8af8d0c248..993369ee94d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -24,7 +24,6 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> -#include <linux/signalfd.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> @@ -45,6 +44,7 @@ #include <linux/resource.h> #include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> +#include <linux/freezer.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -85,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); - /* - * Notify that this sighand has been detached. This must - * be called with the tsk->sighand lock held. Also, this - * access tsk->sighand internally, so it must be called - * before tsk->sighand is reset. - */ - signalfd_detach_locked(tsk); - posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) posix_cpu_timers_exit_group(tsk); @@ -594,6 +586,8 @@ static void exit_mm(struct task_struct * tsk) tsk->mm = NULL; up_read(&mm->mmap_sem); enter_lazy_tlb(mm, current); + /* We don't want this task to be frozen prematurely */ + clear_freeze_flag(tsk); task_unlock(tsk); mmput(mm); } @@ -810,7 +804,7 @@ static void exit_notify(struct task_struct *tsk) __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } - /* Let father know we died + /* Let father know we died * * Thread signals are configurable, but you aren't going to use * that to send signals to arbitary processes. @@ -823,9 +817,7 @@ static void exit_notify(struct task_struct *tsk) * If our self_exec id doesn't match our parent_exec_id then * we have changed execution domain as these two values started * the same after a fork. - * */ - if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && ( tsk->parent_exec_id != t->self_exec_id || tsk->self_exec_id != tsk->parent_exec_id) @@ -845,9 +837,7 @@ static void exit_notify(struct task_struct *tsk) } state = EXIT_ZOMBIE; - if (tsk->exit_signal == -1 && - (likely(tsk->ptrace == 0) || - unlikely(tsk->parent->signal->flags & SIGNAL_GROUP_EXIT))) + if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) state = EXIT_DEAD; tsk->exit_state = state; @@ -976,6 +966,7 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(tsk); @@ -997,7 +988,6 @@ fastcall NORET_TYPE void do_exit(long code) if (tsk->binfmt) module_put(tsk->binfmt->module); - tsk->exit_code = code; proc_exit_connector(tsk); exit_task_namespaces(tsk); exit_notify(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index ba39bdb2a7b..5e67f90a169 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -137,7 +137,7 @@ void __init fork_init(unsigned long mempages) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", sizeof(struct task_struct), - ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL, NULL); + ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); #endif /* @@ -334,6 +334,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); + mm->flags = (current->mm) ? 
current->mm->flags + : MMF_DUMP_FILTER_DEFAULT; mm->core_waiters = 0; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); @@ -1436,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); - INIT_LIST_HEAD(&sighand->signalfd_list); + init_waitqueue_head(&sighand->signalfd_wqh); } void __init proc_caches_init(void) @@ -1444,22 +1446,22 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, - sighand_ctor, NULL); + sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - files_cachep = kmem_cache_create("files_cache", + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); - fs_cachep = kmem_cache_create("fs_cache", + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), 0, - SLAB_PANIC, NULL, NULL); + SLAB_PANIC, NULL); mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } /* @@ -1606,7 +1608,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER)) + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| + CLONE_NEWNET)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) diff --git a/kernel/futex.c b/kernel/futex.c index 5c3f45d07c5..fcc94e7b408 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -346,15 +346,20 @@ static int futex_handle_fault(unsigned long address, vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { - switch (handle_mm_fault(mm, vma, address, 1)) { - case VM_FAULT_MINOR: - ret = 0; - current->min_flt++; - break; - case VM_FAULT_MAJOR: + int fault; + fault = handle_mm_fault(mm, vma, address, 1); + if (unlikely((fault & VM_FAULT_ERROR))) { +#if 0 + /* XXX: let's do this when we verify it is OK */ + if (ret & VM_FAULT_OOM) + ret = -ENOMEM; +#endif + } else { ret = 0; - current->maj_flt++; - break; + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; } } if (!fshared) @@ -1665,6 +1670,7 @@ pi_faulted: attempt); if (ret) goto out; + uval = 0; goto retry_unlocked; } @@ -1937,9 +1943,10 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; unsigned long futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -1959,12 +1966,14 @@ void exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; - if (pending) - handle_futex_death((void __user *)pending + 
futex_offset, - curr, pip); - + next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); + /* * A pending lock might already be on the list, so * don't process it twice: */ @@ -1972,11 +1981,10 @@ void exit_robust_list(struct task_struct *curr) if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&entry, &entry->next, &pi)) + if (rc) return; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -1985,6 +1993,10 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } + + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, @@ -2055,8 +2067,10 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, } /* * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f7921360efa..2c2e2954b71 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -38,10 +38,11 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; - compat_uptr_t uentry, upending; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; + compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; + int rc; /* * Fetch the list head (which was registered earlier, via @@ -61,10 +62,15 @@ void compat_exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&upending, &pending, &head->list_op_pending, &pip)) return; - if (upending) - handle_futex_death((void __user *)pending + futex_offset, curr, pip); - while (compat_ptr(uentry) != &head->list) { + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); /* * A pending lock might already be on the list, so * dont process it twice: @@ -74,12 +80,11 @@ void compat_exit_robust_list(struct task_struct *curr) curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&uentry, &entry, - (compat_uptr_t __user *)&entry->next, &pi)) + if (rc) return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -88,6 +93,9 @@ void compat_exit_robust_list(struct task_struct *curr) cond_resched(); } + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } asmlinkage long diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72d034258ba..dc8a4451d79 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -141,11 +141,7 
@@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); -#ifdef CONFIG_NO_HZ - getnstimeofday(&xts); -#else - xts = xtime; -#endif + xts = current_kernel_time(); tom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); @@ -281,6 +277,30 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) } EXPORT_SYMBOL_GPL(ktime_add_ns); + +/** + * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable + * @kt: minuend + * @nsec: the scalar nsec value to subtract + * + * Returns the subtraction of @nsec from @kt in ktime_t format + */ +ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_sub(kt, tmp); +} + +EXPORT_SYMBOL_GPL(ktime_sub_ns); # endif /* !CONFIG_KTIME_SCALAR */ /* @@ -558,7 +578,8 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ static int hrtimer_switch_to_hres(void) { - struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + int cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; if (base->hres_active) @@ -568,6 +589,8 @@ static int hrtimer_switch_to_hres(void) if (tick_init_highres()) { local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); return 0; } base->hres_active = 1; @@ -683,6 +706,7 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct rb_node **link = &base->active.rb_node; struct rb_node *parent = NULL; struct hrtimer *entry; + int leftmost = 1; /* * Find the right place in the rbtree: @@ -694,18 +718,19 @@ static void enqueue_hrtimer(struct hrtimer *timer, * We dont care about collisions. Nodes with * the same expiry time stay together. */ - if (timer->expires.tv64 < entry->expires.tv64) + if (timer->expires.tv64 < entry->expires.tv64) { link = &(*link)->rb_left; - else + } else { link = &(*link)->rb_right; + leftmost = 0; + } } /* * Insert the timer to the rbtree and check whether it * replaces the first pending timer */ - if (!base->first || timer->expires.tv64 < - rb_entry(base->first, struct hrtimer, node)->expires.tv64) { + if (leftmost) { /* * Reprogram the clock event device. When the timer is already * expired hrtimer_enqueue_reprogram has either called the diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 615ce97c6cf..f1a73f0b54e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -352,13 +352,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * keep it masked and get out of here */ action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { - desc->status |= IRQ_PENDING; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) goto out_unlock; - } desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d8ee241115f..6d9204f3a37 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,5 +1,6 @@ #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/device.h> /* * Device resource management aware IRQ request/free implementation. 
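The hrtimer hunk above adds ktime_sub_ns() as the counterpart of ktime_add_ns() for kernels built without CONFIG_KTIME_SCALAR. A minimal usage sketch of the pair follows; the helper name and the 500us adjustment are illustrative values, not taken from the patch:

/*
 * Illustrative sketch only: move a ktime_t deadline backward or forward
 * by an arbitrary 500us using the ktime_add_ns()/ktime_sub_ns() helpers
 * shown in the hunk above.
 */
static ktime_t adjust_deadline(ktime_t deadline, int bring_forward)
{
	if (bring_forward)
		return ktime_sub_ns(deadline, 500 * NSEC_PER_USEC);

	return ktime_add_ns(deadline, 500 * NSEC_PER_USEC);
}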
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 203a518b6f1..7230d914eaa 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -462,7 +462,9 @@ void free_irq(unsigned int irq, void *dev_id) * We do this after actually deregistering it, to make sure that * a 'real' IRQ doesn't run in parallel with our fake */ + local_irq_save(flags); handler(irq, dev_id); + local_irq_restore(flags); } #endif } @@ -545,14 +547,11 @@ int request_irq(unsigned int irq, irq_handler_t handler, * We do this before actually registering it, to make sure that * a 'real' IRQ doesn't run in parallel with our fake */ - if (irqflags & IRQF_DISABLED) { - unsigned long flags; + unsigned long flags; - local_irq_save(flags); - handler(irq, dev_id); - local_irq_restore(flags); - } else - handler(irq, dev_id); + local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); } #endif diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index b4f1674fca7..50b81b98046 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,15 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); + struct irq_desc *desc = irq_desc + (long)data; + cpumask_t *mask = &desc->affinity; + int len; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PENDING) + mask = &desc->pending_mask; +#endif + len = cpumask_scnprintf(page, count, *mask); if (count - len < 2) return -EINVAL; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 5bfeaed7e48..a8046791ba2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -62,7 +62,12 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) */ desc->chip->enable(irq); - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + /* + * We do not resend level type interrupts. Level type + * interrupts are resent by hardware when they are still + * active. + */ + if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; if (!desc->chip || !desc->chip->retrigger || diff --git a/kernel/kmod.c b/kernel/kmod.c index 4d32eb07717..c6a4f8aebeb 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -33,6 +33,8 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> +#include <linux/notifier.h> +#include <linux/suspend.h> #include <asm/uaccess.h> extern int max_threads; @@ -119,9 +121,10 @@ struct subprocess_info { char **argv; char **envp; struct key *ring; - int wait; + enum umh_wait wait; int retval; struct file *stdin; + void (*cleanup)(char **argv, char **envp); }; /* @@ -180,6 +183,14 @@ static int ____call_usermodehelper(void *data) do_exit(0); } +void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info->argv, info->envp); + kfree(info); +} +EXPORT_SYMBOL(call_usermodehelper_freeinfo); + /* Keventd can't block, but this (a child) can. 
*/ static int wait_for_helper(void *data) { @@ -216,8 +227,8 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait < 0) - kfree(sub_info); + if (sub_info->wait == UMH_NO_WAIT) + call_usermodehelper_freeinfo(sub_info); else complete(sub_info->complete); return 0; @@ -229,34 +240,204 @@ static void __call_usermodehelper(struct work_struct *work) struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); pid_t pid; - int wait = sub_info->wait; + enum umh_wait wait = sub_info->wait; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ - if (wait) + if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else pid = kernel_thread(____call_usermodehelper, sub_info, CLONE_VFORK | SIGCHLD); - if (wait < 0) - return; + switch (wait) { + case UMH_NO_WAIT: + break; - if (pid < 0) { + case UMH_WAIT_PROC: + if (pid > 0) + break; sub_info->retval = pid; + /* FALLTHROUGH */ + + case UMH_WAIT_EXEC: complete(sub_info->complete); - } else if (!wait) - complete(sub_info->complete); + } +} + +#ifdef CONFIG_PM +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + */ +static int usermodehelper_disabled; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * helpers to finish. + */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_pm_callback() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + +static int usermodehelper_pm_callback(struct notifier_block *nfb, + unsigned long action, + void *ignored) +{ + long retval; + + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + usermodehelper_disabled = 1; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). 
+ */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) { + return NOTIFY_OK; + } else { + usermodehelper_disabled = 0; + return NOTIFY_BAD; + } + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + usermodehelper_disabled = 0; + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic_inc(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +static void register_pm_notifier_callback(void) +{ + pm_notifier(usermodehelper_pm_callback, 0); } +#else /* CONFIG_PM */ +#define usermodehelper_disabled 0 + +static inline void helper_lock(void) {} +static inline void helper_unlock(void) {} +static inline void register_pm_notifier_callback(void) {} +#endif /* CONFIG_PM */ /** - * call_usermodehelper_keys - start a usermode application - * @path: pathname for the application - * @argv: null-terminated argument list - * @envp: null-terminated environment list - * @session_keyring: session keyring for process (NULL for an empty keyring) + * call_usermodehelper_setup - prepare to call a usermode helper + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * + * Returns either %NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. + */ +struct subprocess_info *call_usermodehelper_setup(char *path, + char **argv, char **envp) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_setkeys - set the session keys for usermode helper + * @info: a subprocess_info returned by call_usermodehelper_setup + * @session_keyring: the session keyring for the process + */ +void call_usermodehelper_setkeys(struct subprocess_info *info, + struct key *session_keyring) +{ + info->ring = session_keyring; +} +EXPORT_SYMBOL(call_usermodehelper_setkeys); + +/** + * call_usermodehelper_setcleanup - set a cleanup function + * @info: a subprocess_info returned by call_usermodehelper_setup + * @cleanup: a cleanup function + * + * The cleanup function is just befor ethe subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * Function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. + */ +void call_usermodehelper_setcleanup(struct subprocess_info *info, + void (*cleanup)(char **argv, char **envp)) +{ + info->cleanup = cleanup; +} +EXPORT_SYMBOL(call_usermodehelper_setcleanup); + +/** + * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin + * @sub_info: a subprocess_info returned by call_usermodehelper_setup + * @filp: set to the write-end of a pipe + * + * This constructs a pipe, and sets the read end to be the stdin of the + * subprocess, and returns the write-end in *@filp. 
+ */ +int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, + struct file **filp) +{ + struct file *f; + + f = create_write_pipe(); + if (IS_ERR(f)) + return PTR_ERR(f); + *filp = f; + + f = create_read_pipe(f); + if (IS_ERR(f)) { + free_write_pipe(*filp); + return PTR_ERR(f); + } + sub_info->stdin = f; + + return 0; +} +EXPORT_SYMBOL(call_usermodehelper_stdinpipe); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocessa * @wait: wait for the application to finish and return status. * when -1 don't wait at all, but you get no useful error back when * the program couldn't be exec'ed. This makes it safe to call @@ -265,81 +446,70 @@ static void __call_usermodehelper(struct work_struct *work) * Runs a user-space application. The application is started * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). - * - * Must be called from process context. Returns a negative error code - * if program was not execed successfully, or 0. */ -int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, int wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, + enum umh_wait wait) { DECLARE_COMPLETION_ONSTACK(done); - struct subprocess_info *sub_info; int retval; - if (!khelper_wq) - return -EBUSY; - - if (path[0] == '\0') - return 0; + helper_lock(); + if (sub_info->path[0] == '\0') { + retval = 0; + goto out; + } - sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); - if (!sub_info) - return -ENOMEM; + if (!khelper_wq || usermodehelper_disabled) { + retval = -EBUSY; + goto out; + } - INIT_WORK(&sub_info->work, __call_usermodehelper); sub_info->complete = &done; - sub_info->path = path; - sub_info->argv = argv; - sub_info->envp = envp; - sub_info->ring = session_keyring; sub_info->wait = wait; queue_work(khelper_wq, &sub_info->work); - if (wait < 0) /* task has freed sub_info */ + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ return 0; wait_for_completion(&done); retval = sub_info->retval; - kfree(sub_info); + + out: + call_usermodehelper_freeinfo(sub_info); + helper_unlock(); return retval; } -EXPORT_SYMBOL(call_usermodehelper_keys); +EXPORT_SYMBOL(call_usermodehelper_exec); +/** + * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @filp: set to the write-end of a pipe + * + * This is a simple wrapper which executes a usermode-helper function + * with a pipe as stdin. It is implemented entirely in terms of + * lower-level call_usermodehelper_* functions. 
+ */ int call_usermodehelper_pipe(char *path, char **argv, char **envp, struct file **filp) { - DECLARE_COMPLETION(done); - struct subprocess_info sub_info = { - .work = __WORK_INITIALIZER(sub_info.work, - __call_usermodehelper), - .complete = &done, - .path = path, - .argv = argv, - .envp = envp, - .retval = 0, - }; - struct file *f; - - if (!khelper_wq) - return -EBUSY; + struct subprocess_info *sub_info; + int ret; - if (path[0] == '\0') - return 0; + sub_info = call_usermodehelper_setup(path, argv, envp); + if (sub_info == NULL) + return -ENOMEM; - f = create_write_pipe(); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; + ret = call_usermodehelper_stdinpipe(sub_info, filp); + if (ret < 0) + goto out; - f = create_read_pipe(f); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info.stdin = f; + return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - queue_work(khelper_wq, &sub_info.work); - wait_for_completion(&done); - return sub_info.retval; + out: + call_usermodehelper_freeinfo(sub_info); + return ret; } EXPORT_SYMBOL(call_usermodehelper_pipe); @@ -347,4 +517,5 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); + register_pm_notifier_callback(); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e47d8c493f..4b8a4493c54 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -675,9 +675,18 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to be notified first */ }; +unsigned long __weak arch_deref_entry_point(void *entry) +{ + return (unsigned long)entry; +} int __kprobes register_jprobe(struct jprobe *jp) { + unsigned long addr = arch_deref_entry_point(jp->entry); + + if (!kernel_text_address(addr)) + return -EINVAL; + /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; @@ -1054,6 +1063,11 @@ EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); +#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); +#endif + +#ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); +#endif diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 559deca5ed1..d0e5c48e18c 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -62,6 +62,28 @@ static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) KERNEL_ATTR_RO(kexec_crash_loaded); #endif /* CONFIG_KEXEC */ +/* + * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
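The kmod.c changes above replace the monolithic call_usermodehelper_keys() with a setup/exec pair: call_usermodehelper_setup(), the optional _setkeys()/_setcleanup()/_stdinpipe() steps, and call_usermodehelper_exec(). A minimal sketch of a caller driving the new interface; the helper path and its arguments are illustrative values, not taken from the patch:

/*
 * Illustrative sketch only: run a made-up "/sbin/example-helper" through
 * the setup/exec interface introduced above. call_usermodehelper_exec()
 * frees the subprocess_info on the waited-for paths, so the caller does
 * not free it here.
 */
static int run_example_helper(void)
{
	char *argv[] = { "/sbin/example-helper", "hello", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp);
	if (!info)
		return -ENOMEM;

	/* Keys, a cleanup callback or a stdin pipe could be attached here
	 * via call_usermodehelper_setkeys(), call_usermodehelper_setcleanup()
	 * or call_usermodehelper_stdinpipe(). */

	/* UMH_WAIT_EXEC: wait until exec() has happened, not for exit. */
	return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}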
*/ +extern const void __start_notes __attribute__((weak)); +extern const void __stop_notes __attribute__((weak)); +#define notes_size (&__stop_notes - &__start_notes) + +static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, &__start_notes + off, count); + return count; +} + +static struct bin_attribute notes_attr = { + .attr = { + .name = "notes", + .mode = S_IRUGO, + }, + .read = &notes_read, +}; + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -88,6 +110,12 @@ static int __init ksysfs_init(void) error = sysfs_create_group(&kernel_subsys.kobj, &kernel_attr_group); + if (!error && notes_size > 0) { + notes_attr.size = notes_size; + error = sysfs_create_bin_file(&kernel_subsys.kobj, + &notes_attr); + } + return error; } diff --git a/kernel/kthread.c b/kernel/kthread.c index a404f7ee739..dcfe724300e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -214,23 +214,15 @@ int kthread_stop(struct task_struct *k) } EXPORT_SYMBOL(kthread_stop); - -static noinline __init_refok void kthreadd_setup(void) +int kthreadd(void *unused) { struct task_struct *tsk = current; + /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); - ignore_signals(tsk); - set_user_nice(tsk, -5); set_cpus_allowed(tsk, CPU_MASK_ALL); -} - -int kthreadd(void *unused) -{ - /* Setup a clean context for our children to inherit. */ - kthreadd_setup(); current->flags |= PF_NOFREEZE; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index edba2ffb43d..734da579ad1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -5,7 +5,8 @@ * * Started by Ingo Molnar: * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: @@ -37,11 +38,26 @@ #include <linux/debug_locks.h> #include <linux/irqflags.h> #include <linux/utsname.h> +#include <linux/hash.h> #include <asm/sections.h> #include "lockdep_internals.h" +#ifdef CONFIG_PROVE_LOCKING +int prove_locking = 1; +module_param(prove_locking, int, 0644); +#else +#define prove_locking 0 +#endif + +#ifdef CONFIG_LOCK_STAT +int lock_stat = 1; +module_param(lock_stat, int, 0644); +#else +#define lock_stat 0 +#endif + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -96,23 +112,6 @@ unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; /* - * Allocate a lockdep entry. (assumes the graph_lock held, returns - * with NULL on failure) - */ -static struct lock_list *alloc_list_entry(void) -{ - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { - if (!debug_locks_off_graph_unlock()) - return NULL; - - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - return list_entries + nr_list_entries++; -} - -/* - * All data structures here are protected by the global debug_lock.
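The ksysfs.c hunk above exposes the kernel's .notes ELF section as the read-only binary file /sys/kernel/notes. A small user-space sketch that dumps it; error handling is trimmed and nothing here is taken from the patch beyond that sysfs path:

/* User-space sketch: dump the raw note records exposed by the new
 * /sys/kernel/notes binary attribute. Minimal error handling. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/sys/kernel/notes", O_RDONLY);

	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);
	return 0;
}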
* * Mutex key structs only get allocated, once during bootup, and never @@ -121,6 +120,117 @@ static struct lock_list *alloc_list_entry(void) unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +#ifdef CONFIG_LOCK_STAT +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); + +static int lock_contention_point(struct lock_class *class, unsigned long ip) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + if (class->contention_point[i] == 0) { + class->contention_point[i] = ip; + break; + } + if (class->contention_point[i] == ip) + break; + } + + return i; +} + +static void lock_time_inc(struct lock_time *lt, s64 time) +{ + if (time > lt->max) + lt->max = time; + + if (time < lt->min || !lt->min) + lt->min = time; + + lt->total += time; + lt->nr++; +} + +static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) +{ + dst->min += src->min; + dst->max += src->max; + dst->total += src->total; + dst->nr += src->nr; +} + +struct lock_class_stats lock_stats(struct lock_class *class) +{ + struct lock_class_stats stats; + int cpu, i; + + memset(&stats, 0, sizeof(struct lock_class_stats)); + for_each_possible_cpu(cpu) { + struct lock_class_stats *pcs = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) + stats.contention_point[i] += pcs->contention_point[i]; + + lock_time_add(&pcs->read_waittime, &stats.read_waittime); + lock_time_add(&pcs->write_waittime, &stats.write_waittime); + + lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + + for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) + stats.bounces[i] += pcs->bounces[i]; + } + + return stats; +} + +void clear_lock_stats(struct lock_class *class) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct lock_class_stats *cpu_stats = + &per_cpu(lock_stats, cpu)[class - lock_classes]; + + memset(cpu_stats, 0, sizeof(struct lock_class_stats)); + } + memset(class->contention_point, 0, sizeof(class->contention_point)); +} + +static struct lock_class_stats *get_lock_stats(struct lock_class *class) +{ + return &get_cpu_var(lock_stats)[class - lock_classes]; +} + +static void put_lock_stats(struct lock_class_stats *stats) +{ + put_cpu_var(lock_stats); +} + +static void lock_release_holdtime(struct held_lock *hlock) +{ + struct lock_class_stats *stats; + s64 holdtime; + + if (!lock_stat) + return; + + holdtime = sched_clock() - hlock->holdtime_stamp; + + stats = get_lock_stats(hlock->class); + if (hlock->read) + lock_time_inc(&stats->read_holdtime, holdtime); + else + lock_time_inc(&stats->write_holdtime, holdtime); + put_lock_stats(stats); +} +#else +static inline void lock_release_holdtime(struct held_lock *hlock) +{ +} +#endif + /* * We keep a global list of all lock classes. The list only grows, * never shrinks. 
The list is only accessed with the lockdep @@ -133,24 +243,18 @@ LIST_HEAD(all_lock_classes); */ #define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) #define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) -#define CLASSHASH_MASK (CLASSHASH_SIZE - 1) -#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK) +#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) static struct list_head classhash_table[CLASSHASH_SIZE]; -unsigned long nr_lock_chains; -static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; - /* * We put the lock dependency chains into a hash-table as well, to cache * their existence: */ #define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) #define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) -#define CHAINHASH_MASK (CHAINHASH_SIZE - 1) -#define __chainhashfn(chain) \ - (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK) +#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) static struct list_head chainhash_table[CHAINHASH_SIZE]; @@ -223,26 +327,6 @@ static int verbose(struct lock_class *class) return 0; } -#ifdef CONFIG_TRACE_IRQFLAGS - -static int hardirq_verbose(struct lock_class *class) -{ -#if HARDIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int softirq_verbose(struct lock_class *class) -{ -#if SOFTIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -#endif - /* * Stack-trace: tightly packed array of stack backtrace * addresses. Protected by the graph_lock. @@ -291,6 +375,11 @@ unsigned int max_recursion_depth; * about it later on, in lockdep_info(). */ static int lockdep_init_error; +static unsigned long lockdep_init_trace_data[20]; +static struct stack_trace lockdep_init_trace = { + .max_entries = ARRAY_SIZE(lockdep_init_trace_data), + .entries = lockdep_init_trace_data, +}; /* * Various lockdep statistics: @@ -482,6 +571,262 @@ static void print_lock_dependencies(struct lock_class *class, int depth) } } +static void print_kernel_version(void) +{ + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE + return class_filter(class); +#endif + return 0; +} + +/* + * Is this the address of a static object: + */ +static int static_obj(void *obj) +{ + unsigned long start = (unsigned long) &_stext, + end = (unsigned long) &_end, + addr = (unsigned long) obj; +#ifdef CONFIG_SMP + int i; +#endif + + /* + * static variable? + */ + if ((addr >= start) && (addr < end)) + return 1; + +#ifdef CONFIG_SMP + /* + * percpu var? + */ + for_each_possible_cpu(i) { + start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); + end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM + + per_cpu_offset(i); + + if ((addr >= start) && (addr < end)) + return 1; + } +#endif + + /* + * module var? 
+ */ + return is_module_address(addr); +} + +/* + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: + */ +static int count_matching_names(struct lock_class *new_class) +{ + struct lock_class *class; + int count = 0; + + if (!new_class->name) + return 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + if (new_class->key - new_class->subclass == class->key) + return class->name_version; + if (class->name && !strcmp(class->name, new_class->name)) + count = max(count, class->name_version); + } + + return count + 1; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * If the architecture calls into lockdep before initializing + * the hashes then we'll warn about it later. (we cannot printk + * right now) + */ + if (unlikely(!lockdep_initialized)) { + lockdep_init(); + lockdep_init_error = 1; + save_stack_trace(&lockdep_init_trace); + } +#endif + + /* + * Static locks do not have their class-keys yet - for them the key + * is the lock object itself: + */ + if (unlikely(!lock->key)) + lock->key = (void *)lock; + + /* + * NOTE: the class-key must be unique. For dynamic locks, a static + * lock_class_key variable is passed in through the mutex_init() + * (or spin_lock_init()) call - which acts as the key. For static + * locks we use the lock object itself as the key. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > + sizeof(struct lockdep_map)); + + key = lock->key->subkeys + subclass; + + hash_head = classhashentry(key); + + /* + * We can walk the hash lockfree, because the hash only + * grows, and we are careful when adding entries to the end: + */ + list_for_each_entry(class, hash_head, hash_entry) { + if (class->key == key) { + WARN_ON_ONCE(class->name != lock->name); + return class; + } + } + + return NULL; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + unsigned long flags; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + return class; + + /* + * Debug-check: all keys must be persistent! 
+ */ + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return NULL; + } + + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + /* + * We have to do the hash-walk again, to avoid races + * with another CPU: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_unlock_set; + /* + * Allocate a new key from the static array, and add it to + * the hash: + */ + if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + if (!debug_locks_off_graph_unlock()) { + raw_local_irq_restore(flags); + return NULL; + } + raw_local_irq_restore(flags); + + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + class = lock_classes + nr_lock_classes++; + debug_atomic_inc(&nr_unused_locks); + class->key = key; + class->name = lock->name; + class->subclass = subclass; + INIT_LIST_HEAD(&class->lock_entry); + INIT_LIST_HEAD(&class->locks_before); + INIT_LIST_HEAD(&class->locks_after); + class->name_version = count_matching_names(class); + /* + * We use RCU's safe list-add method to make + * parallel walking of the hash-list safe: + */ + list_add_tail_rcu(&class->hash_entry, hash_head); + + if (verbose(class)) { + graph_unlock(); + raw_local_irq_restore(flags); + + printk("\nnew class %p: %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + } +out_unlock_set: + graph_unlock(); + raw_local_irq_restore(flags); + + if (!subclass || force) + lock->class_cache = class; + + if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) + return NULL; + + return class; +} + +#ifdef CONFIG_PROVE_LOCKING +/* + * Allocate a lockdep entry. 
(assumes the graph_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ + if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + if (!debug_locks_off_graph_unlock()) + return NULL; + + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + return NULL; + } + return list_entries + nr_list_entries++; +} + /* * Add a new dependency to the head of the list: */ @@ -542,13 +887,6 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) return 0; } -static void print_kernel_version(void) -{ - printk("%s %.*s\n", init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); -} - /* * When a circular dependency is detected, print the * header first: @@ -640,15 +978,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -static int very_verbose(struct lock_class *class) -{ -#if VERY_VERBOSE - return class_filter(class); -#endif - return 0; -} #ifdef CONFIG_TRACE_IRQFLAGS - /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -821,6 +1151,78 @@ check_usage(struct task_struct *curr, struct held_lock *prev, bit_backwards, bit_forwards, irqclass); } +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + /* + * Prove that the new dependency does not connect a hardirq-safe + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, + LOCK_ENABLED_HARDIRQS, "hard")) + return 0; + + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, + LOCK_ENABLED_HARDIRQS, "hard-read")) + return 0; + + /* + * Prove that the new dependency does not connect a softirq-safe + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + /* + * Prove that the new dependency does not connect a softirq-safe-read + * lock with a softirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ + if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, + LOCK_ENABLED_SOFTIRQS, "soft")) + return 0; + + return 1; +} + +static void inc_chains(void) +{ + if (current->hardirq_context) + nr_hardirq_chains++; + else { + if (current->softirq_context) + nr_softirq_chains++; + else + nr_process_chains++; + } +} + +#else + +static inline int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + return 1; +} + +static inline void inc_chains(void) +{ + nr_process_chains++; +} + #endif static int @@ -922,47 +1324,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, if (!(check_noncircular(next->class, 0))) return print_circular_bug_tail(); -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a 
hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ, - LOCK_ENABLED_HARDIRQS, "hard")) + if (!check_prev_add_irq(curr, prev, next)) return 0; /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ, - LOCK_ENABLED_HARDIRQS, "hard-read")) - return 0; - - /* - * Prove that the new dependency does not connect a softirq-safe - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; - /* - * Prove that the new dependency does not connect a softirq-safe-read - * lock with a softirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: - */ - if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ, - LOCK_ENABLED_SOFTIRQS, "soft")) - return 0; -#endif - /* * For recursive read-locks we do all the dependency checks, * but we dont store read-triggered dependencies (only * write-triggered dependencies). This ensures that only the @@ -1088,224 +1453,8 @@ out_bug: return 0; } - -/* - * Is this the address of a static object: - */ -static int static_obj(void *obj) -{ - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; -#ifdef CONFIG_SMP - int i; -#endif - - /* - * static variable? - */ - if ((addr >= start) && (addr < end)) - return 1; - -#ifdef CONFIG_SMP - /* - * percpu var? - */ - for_each_possible_cpu(i) { - start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); - end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM - + per_cpu_offset(i); - - if ((addr >= start) && (addr < end)) - return 1; - } -#endif - - /* - * module var? - */ - return is_module_address(addr); -} - -/* - * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: - */ -static int count_matching_names(struct lock_class *new_class) -{ - struct lock_class *class; - int count = 0; - - if (!new_class->name) - return 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (new_class->key - new_class->subclass == class->key) - return class->name_version; - if (class->name && !strcmp(class->name, new_class->name)) - count = max(count, class->name_version); - } - - return count + 1; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. 
(we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - } -#endif - - /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: - */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; - - /* - * NOTE: the class-key must be unique. For dynamic locks, a static - * lock_class_key variable is passed in through the mutex_init() - * (or spin_lock_init()) call - which acts as the key. For static - * locks we use the lock object itself as the key. - */ - BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(struct lock_class)); - - key = lock->key->subkeys + subclass; - - hash_head = classhashentry(key); - - /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - return class; - - return NULL; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - unsigned long flags; - - class = look_up_lock_class(lock, subclass); - if (likely(class)) - return class; - - /* - * Debug-check: all keys must be persistent! - */ - if (!static_obj(lock->key)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - - return NULL; - } - - key = lock->key->subkeys + subclass; - hash_head = classhashentry(key); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - /* - * We have to do the hash-walk again, to avoid races - * with another CPU: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - goto out_unlock_set; - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { - if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); - return NULL; - } - raw_local_irq_restore(flags); - - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); - return NULL; - } - class = lock_classes + nr_lock_classes++; - debug_atomic_inc(&nr_unused_locks); - class->key = key; - class->name = lock->name; - class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); - class->name_version = count_matching_names(class); - /* - * We use RCU's safe list-add method to make - * parallel walking of the hash-list safe: - */ - list_add_tail_rcu(&class->hash_entry, hash_head); - - if (verbose(class)) { - graph_unlock(); - raw_local_irq_restore(flags); - - printk("\nnew class %p: %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - } -out_unlock_set: - graph_unlock(); - raw_local_irq_restore(flags); - - if (!subclass || force) - lock->class_cache = class; - - if 
(DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) - return NULL; - - return class; -} +unsigned long nr_lock_chains; +static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; /* * Look up a dependency chain. If the key is not present yet then @@ -1366,21 +1515,72 @@ cache_hit: chain->chain_key = chain_key; list_add_tail_rcu(&chain->entry, hash_head); debug_atomic_inc(&chain_lookup_misses); -#ifdef CONFIG_TRACE_IRQFLAGS - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -#else - nr_process_chains++; -#endif + inc_chains(); + + return 1; +} + +static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, + struct held_lock *hlock, int chain_head) +{ + /* + * Trylock needs to maintain the stack of held locks, but it + * does not add new dependencies, because trylock can be done + * in any order. + * + * We look up the chain_key and do the O(N^2) check and update of + * the dependencies only if this is a new dependency chain. + * (If lookup_chain_cache() returns with 1 it acquires + * graph_lock for us) + */ + if (!hlock->trylock && (hlock->check == 2) && + lookup_chain_cache(curr->curr_chain_key, hlock->class)) { + /* + * Check whether last held lock: + * + * - is irq-safe, if this lock is irq-unsafe + * - is softirq-safe, if this lock is hardirq-unsafe + * + * And check whether the new lock's dependency graph + * could lead back to the previous lock. + * + * any of these scenarios could lead to a deadlock. If + * All validations + */ + int ret = check_deadlock(curr, hlock, lock, hlock->read); + + if (!ret) + return 0; + /* + * Mark recursive read, as we jump over it when + * building dependencies (just like we jump over + * trylock entries): + */ + if (ret == 2) + hlock->read = 2; + /* + * Add dependency only if this lock is not the head + * of the chain, and if it's not a secondary read-lock: + */ + if (!chain_head && ret != 2) + if (!check_prevs_add(curr, hlock)) + return 0; + graph_unlock(); + } else + /* after lookup_chain_cache(): */ + if (unlikely(!debug_locks)) + return 0; return 1; } +#else +static inline int validate_chain(struct task_struct *curr, + struct lockdep_map *lock, struct held_lock *hlock, + int chain_head) +{ + return 1; +} +#endif /* * We are building curr_chain_key incrementally, so double-check @@ -1425,6 +1625,57 @@ static void check_chain_key(struct task_struct *curr) #endif } +static int +print_usage_bug(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ INFO: inconsistent lock state ]\n"); + print_kernel_version(); + printk( "---------------------------------\n"); + + printk("inconsistent {%s} -> {%s} usage.\n", + usage_str[prev_bit], usage_str[new_bit]); + + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + curr->comm, curr->pid, + trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + trace_hardirqs_enabled(curr), + trace_softirqs_enabled(curr)); + print_lock(this); + + printk("{%s} state was registered at:\n", usage_str[prev_bit]); + print_stack_trace(this->class->usage_traces + prev_bit, 1); + + print_irqtrace_events(curr); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + 
dump_stack(); + + return 0; +} + +/* + * Print out an error if an invalid bit is set: + */ +static inline int +valid_state(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +{ + if (unlikely(this->class->usage_mask & (1 << bad_bit))) + return print_usage_bug(curr, this, bad_bit, new_bit); + return 1; +} + +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + #ifdef CONFIG_TRACE_IRQFLAGS /* @@ -1518,90 +1769,30 @@ void print_irqtrace_events(struct task_struct *curr) print_ip_sym(curr->softirq_disable_ip); } -#endif - -static int -print_usage_bug(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +static int hardirq_verbose(struct lock_class *class) { - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n=================================\n"); - printk( "[ INFO: inconsistent lock state ]\n"); - print_kernel_version(); - printk( "---------------------------------\n"); - - printk("inconsistent {%s} -> {%s} usage.\n", - usage_str[prev_bit], usage_str[new_bit]); - - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", - curr->comm, curr->pid, - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); - print_lock(this); - - printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(this->class->usage_traces + prev_bit, 1); - - print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - +#if HARDIRQ_VERBOSE + return class_filter(class); +#endif return 0; } -/* - * Print out an error if an invalid bit is set: - */ -static inline int -valid_state(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +static int softirq_verbose(struct lock_class *class) { - if (unlikely(this->class->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); - return 1; +#if SOFTIRQ_VERBOSE + return class_filter(class); +#endif + return 0; } #define STRICT_READ_CHECKS 1 -/* - * Mark a lock with a usage bit, and validate the state transition: - */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) +static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { - unsigned int new_mask = 1 << new_bit, ret = 1; - - /* - * If already set then do not dirty the cacheline, - * nor do any checks: - */ - if (likely(this->class->usage_mask & new_mask)) - return 1; - - if (!graph_lock()) - return 0; - /* - * Make sure we didnt race: - */ - if (unlikely(this->class->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } - - this->class->usage_mask |= new_mask; + int ret = 1; - if (!save_trace(this->class->usage_traces + new_bit)) - return 0; - - switch (new_bit) { -#ifdef CONFIG_TRACE_IRQFLAGS + switch(new_bit) { case LOCK_USED_IN_HARDIRQ: if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS)) return 0; @@ -1760,37 +1951,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (softirq_verbose(this->class)) ret = 2; break; -#endif - case LOCK_USED: - /* - * Add it to the global list of classes: - */ - 
list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); - debug_atomic_dec(&nr_unused_locks); - break; default: - if (!debug_locks_off_graph_unlock()) - return 0; WARN_ON(1); - return 0; - } - - graph_unlock(); - - /* - * We must printk outside of the graph_lock: - */ - if (ret == 2) { - printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); - print_lock(this); - print_irqtrace_events(curr); - dump_stack(); + break; } return ret; } -#ifdef CONFIG_TRACE_IRQFLAGS /* * Mark all held locks with a usage bit: */ @@ -1973,9 +2141,176 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(&redundant_softirqs_off); } +static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +{ + /* + * If non-trylock use in a hardirq or softirq context, then + * mark the lock as used in these contexts: + */ + if (!hlock->trylock) { + if (hlock->read) { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_HARDIRQ_READ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_SOFTIRQ_READ)) + return 0; + } else { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) + return 0; + } + } + if (!hlock->hardirqs_off) { + if (hlock->read) { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS_READ)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQS)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQS)) + return 0; + } + } + + return 1; +} + +static int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + unsigned int depth = curr->lockdep_depth; + + /* + * Keep track of points where we cross into an interrupt context: + */ + hlock->irq_context = 2*(curr->hardirq_context ? 
1 : 0) + + curr->softirq_context; + if (depth) { + struct held_lock *prev_hlock; + + prev_hlock = curr->held_locks + depth-1; + /* + * If we cross into another context, reset the + * hash key (this also prevents the checking and the + * adding of the dependency to 'prev'): + */ + if (prev_hlock->irq_context != hlock->irq_context) + return 1; + } + return 0; +} + +#else + +static inline +int mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) +{ + WARN_ON(1); + return 1; +} + +static inline int mark_irqflags(struct task_struct *curr, + struct held_lock *hlock) +{ + return 1; +} + +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; +} + #endif /* + * Mark a lock with a usage bit, and validate the state transition: + */ +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) +{ + unsigned int new_mask = 1 << new_bit, ret = 1; + + /* + * If already set then do not dirty the cacheline, + * nor do any checks: + */ + if (likely(this->class->usage_mask & new_mask)) + return 1; + + if (!graph_lock()) + return 0; + /* + * Make sure we didnt race: + */ + if (unlikely(this->class->usage_mask & new_mask)) { + graph_unlock(); + return 1; + } + + this->class->usage_mask |= new_mask; + + if (!save_trace(this->class->usage_traces + new_bit)) + return 0; + + switch (new_bit) { + case LOCK_USED_IN_HARDIRQ: + case LOCK_USED_IN_SOFTIRQ: + case LOCK_USED_IN_HARDIRQ_READ: + case LOCK_USED_IN_SOFTIRQ_READ: + case LOCK_ENABLED_HARDIRQS: + case LOCK_ENABLED_SOFTIRQS: + case LOCK_ENABLED_HARDIRQS_READ: + case LOCK_ENABLED_SOFTIRQS_READ: + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) + return 0; + break; + case LOCK_USED: + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); + debug_atomic_dec(&nr_unused_locks); + break; + default: + if (!debug_locks_off_graph_unlock()) + return 0; + WARN_ON(1); + return 0; + } + + graph_unlock(); + + /* + * We must printk outside of the graph_lock: + */ + if (ret == 2) { + printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); + print_lock(this); + print_irqtrace_events(curr); + dump_stack(); + } + + return ret; +} + +/* * Initialize a lock instance's lock-class mapping info: */ void lockdep_init_map(struct lockdep_map *lock, const char *name, @@ -1999,6 +2334,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; lock->key = key; lock->class_cache = NULL; +#ifdef CONFIG_LOCK_STAT + lock->cpu = raw_smp_processor_id(); +#endif if (subclass) register_lock_class(lock, subclass, 1); } @@ -2020,6 +2358,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; u64 chain_key; + if (!prove_locking) + check = 1; + if (unlikely(!debug_locks)) return 0; @@ -2070,57 +2411,18 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->read = read; hlock->check = check; hlock->hardirqs_off = hardirqs_off; - - if (check != 2) - goto out_calc_hash; -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * If non-trylock use in a hardirq or softirq context, then - * mark the lock as used in these contexts: - */ - if (!trylock) { - if (read) { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - } else { - if (curr->hardirq_context) - if (!mark_lock(curr, 
hlock, LOCK_USED_IN_HARDIRQ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) - return 0; - } - } - if (!hardirqs_off) { - if (read) { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS)) - return 0; - } - } +#ifdef CONFIG_LOCK_STAT + hlock->waittime_stamp = 0; + hlock->holdtime_stamp = sched_clock(); #endif + + if (check == 2 && !mark_irqflags(curr, hlock)) + return 0; + /* mark it as used: */ if (!mark_lock(curr, hlock, LOCK_USED)) return 0; -out_calc_hash: + /* * Calculate the chain hash: it's the combined has of all the * lock keys along the dependency chain. We save the hash value @@ -2143,77 +2445,15 @@ out_calc_hash: } hlock->prev_chain_key = chain_key; - -#ifdef CONFIG_TRACE_IRQFLAGS - /* - * Keep track of points where we cross into an interrupt context: - */ - hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + - curr->softirq_context; - if (depth) { - struct held_lock *prev_hlock; - - prev_hlock = curr->held_locks + depth-1; - /* - * If we cross into another context, reset the - * hash key (this also prevents the checking and the - * adding of the dependency to 'prev'): - */ - if (prev_hlock->irq_context != hlock->irq_context) { - chain_key = 0; - chain_head = 1; - } + if (separate_irq_context(curr, hlock)) { + chain_key = 0; + chain_head = 1; } -#endif chain_key = iterate_chain_key(chain_key, id); curr->curr_chain_key = chain_key; - /* - * Trylock needs to maintain the stack of held locks, but it - * does not add new dependencies, because trylock can be done - * in any order. - * - * We look up the chain_key and do the O(N^2) check and update of - * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires - * graph_lock for us) - */ - if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { - /* - * Check whether last held lock: - * - * - is irq-safe, if this lock is irq-unsafe - * - is softirq-safe, if this lock is hardirq-unsafe - * - * And check whether the new lock's dependency graph - * could lead back to the previous lock. - * - * any of these scenarios could lead to a deadlock. If - * All validations - */ - int ret = check_deadlock(curr, hlock, lock, read); - - if (!ret) - return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* - * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: - */ - if (!chain_head && ret != 2) - if (!check_prevs_add(curr, hlock)) - return 0; - graph_unlock(); - } else - /* after lookup_chain_cache(): */ - if (unlikely(!debug_locks)) - return 0; + if (!validate_chain(curr, lock, hlock, chain_head)) + return 0; curr->lockdep_depth++; check_chain_key(curr); @@ -2315,6 +2555,8 @@ lock_release_non_nested(struct task_struct *curr, return print_unlock_inbalance_bug(curr, lock, ip); found_it: + lock_release_holdtime(hlock); + /* * We have the right lock to unlock, 'hlock' points to it. 
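/*
 * Sketch, not taken from the hunks shown here: the likely shape of the
 * lock_release_holdtime() call added to the unlock paths above, mirroring
 * the waittime accounting done in __lock_acquired() further down. It relies
 * on the lockdep-internal helpers this patch introduces (lock_stat,
 * get_lock_stats(), put_lock_stats(), lock_time_inc(), struct
 * lock_class_stats); the real definition lives in a hunk not visible in
 * this excerpt, and the name below is only illustrative.
 */
static void example_lock_release_holdtime(struct held_lock *hlock)
{
	struct lock_class_stats *stats;
	s64 holdtime;

	if (!lock_stat)
		return;

	/* holdtime_stamp was taken by __lock_acquire() */
	holdtime = sched_clock() - hlock->holdtime_stamp;

	stats = get_lock_stats(hlock->class);
	if (hlock->read)
		lock_time_inc(&stats->read_holdtime, holdtime);
	else
		lock_time_inc(&stats->write_holdtime, holdtime);
	put_lock_stats(stats);
}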
* Now we remove it from the stack, and add back the other @@ -2367,6 +2609,8 @@ static int lock_release_nested(struct task_struct *curr, curr->curr_chain_key = hlock->prev_chain_key; + lock_release_holdtime(hlock); + #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; hlock->class = NULL; @@ -2441,6 +2685,9 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2460,6 +2707,9 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { unsigned long flags; + if (unlikely(!lock_stat && !prove_locking)) + return; + if (unlikely(current->lockdep_recursion)) return; @@ -2473,6 +2723,166 @@ void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) EXPORT_SYMBOL_GPL(lock_release); +#ifdef CONFIG_LOCK_STAT +static int +print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n=================================\n"); + printk( "[ BUG: bad contention detected! ]\n"); + printk( "---------------------------------\n"); + printk("%s/%d is trying to contend lock (", + curr->comm, curr->pid); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no locks held!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static void +__lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + int i, point; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, ip); + return; + +found_it: + hlock->waittime_stamp = sched_clock(); + + point = lock_contention_point(hlock->class, ip); + + stats = get_lock_stats(hlock->class); + if (point < ARRAY_SIZE(stats->contention_point)) + stats->contention_point[i]++; + if (lock->cpu != smp_processor_id()) + stats->bounces[bounce_contended + !!hlock->read]++; + put_lock_stats(stats); +} + +static void +__lock_acquired(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + u64 now; + s64 waittime = 0; + int i, cpu; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, _RET_IP_); + return; + +found_it: + cpu = smp_processor_id(); + if (hlock->waittime_stamp) { + now = sched_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + } + + stats = get_lock_stats(hlock->class); + if (waittime) { + if (hlock->read) + 
lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + } + if (lock->cpu != cpu) + stats->bounces[bounce_acquired + !!hlock->read]++; + put_lock_stats(stats); + + lock->cpu = cpu; +} + +void lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_contended(lock, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_contended); + +void lock_acquired(struct lockdep_map *lock) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_acquired(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquired); +#endif + /* * Used by the testsuite, sanitize the validator state * after a simulated failure: @@ -2636,8 +3046,11 @@ void __init lockdep_info(void) sizeof(struct held_lock) * MAX_LOCK_DEPTH); #ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) - printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n"); + if (lockdep_init_error) { + printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); + printk("Call stack leading to lockdep invocation was:\n"); + print_stack_trace(&lockdep_init_trace, 0); + } #endif } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 58f35e586ee..c851b2dcc68 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -5,7 +5,8 @@ * * Started by Ingo Molnar: * - * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * * Code for /proc/lockdep and /proc/lockdep_stats: * @@ -15,6 +16,10 @@ #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/debug_locks.h> +#include <linux/vmalloc.h> +#include <linux/sort.h> +#include <asm/uaccess.h> +#include <asm/div64.h> #include "lockdep_internals.h" @@ -271,8 +276,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (nr_list_entries) factor = sum_forward_deps / nr_list_entries; +#ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); +#endif #ifdef CONFIG_TRACE_IRQFLAGS seq_printf(m, " in-hardirq chains: %11u\n", @@ -339,9 +346,295 @@ static const struct file_operations proc_lockdep_stats_operations = { .open = lockdep_stats_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, +}; + +#ifdef CONFIG_LOCK_STAT + +struct lock_stat_data { + struct lock_class *class; + struct lock_class_stats stats; +}; + +struct lock_stat_seq { + struct lock_stat_data *iter; + struct lock_stat_data *iter_end; + struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; }; +/* + * sort on absolute number of contentions + */ +static int lock_stat_cmp(const void *l, const void *r) +{ + const struct lock_stat_data *dl = l, *dr = r; + unsigned long nl, nr; + + nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; + nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; + + return nr - nl; +} + +static void seq_line(struct seq_file *m, char c, int 
offset, int length) +{ + int i; + + for (i = 0; i < offset; i++) + seq_puts(m, " "); + for (i = 0; i < length; i++) + seq_printf(m, "%c", c); + seq_puts(m, "\n"); +} + +static void snprint_time(char *buf, size_t bufsiz, s64 nr) +{ + unsigned long rem; + + rem = do_div(nr, 1000); /* XXX: do_div_signed */ + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); +} + +static void seq_time(struct seq_file *m, s64 time) +{ + char num[15]; + + snprint_time(num, sizeof(num), time); + seq_printf(m, " %14s", num); +} + +static void seq_lock_time(struct seq_file *m, struct lock_time *lt) +{ + seq_printf(m, "%14lu", lt->nr); + seq_time(m, lt->min); + seq_time(m, lt->max); + seq_time(m, lt->total); +} + +static void seq_stats(struct seq_file *m, struct lock_stat_data *data) +{ + char name[39]; + struct lock_class *class; + struct lock_class_stats *stats; + int i, namelen; + + class = data->class; + stats = &data->stats; + + namelen = 38; + if (class->name_version > 1) + namelen -= 2; /* XXX truncates versions > 9 */ + if (class->subclass) + namelen -= 2; + + if (!class->name) { + char str[KSYM_NAME_LEN]; + const char *key_name; + + key_name = __get_key_name(class->key, str); + snprintf(name, namelen, "%s", key_name); + } else { + snprintf(name, namelen, "%s", class->name); + } + namelen = strlen(name); + if (class->name_version > 1) { + snprintf(name+namelen, 3, "#%d", class->name_version); + namelen += 2; + } + if (class->subclass) { + snprintf(name+namelen, 3, "/%d", class->subclass); + namelen += 2; + } + + if (stats->write_holdtime.nr) { + if (stats->read_holdtime.nr) + seq_printf(m, "%38s-W:", name); + else + seq_printf(m, "%40s:", name); + + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); + seq_lock_time(m, &stats->write_waittime); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); + seq_lock_time(m, &stats->write_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_holdtime.nr) { + seq_printf(m, "%38s-R:", name); + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); + seq_lock_time(m, &stats->read_waittime); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); + seq_lock_time(m, &stats->read_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_waittime.nr + stats->write_waittime.nr == 0) + return; + + if (stats->read_holdtime.nr) + namelen += 2; + + for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + char sym[KSYM_SYMBOL_LEN]; + char ip[32]; + + if (class->contention_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + sprint_symbol(sym, class->contention_point[i]); + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contention_point[i]); + seq_printf(m, "%40s %14lu %29s %s\n", name, + stats->contention_point[i], + ip, sym); + } + if (i) { + seq_puts(m, "\n"); + seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); + seq_puts(m, "\n"); + } +} + +static void seq_header(struct seq_file *m) +{ + seq_printf(m, "lock_stat version 0.2\n"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " + "%14s %14s\n", + "class name", + "con-bounces", + "contentions", + "waittime-min", + "waittime-max", + "waittime-total", + "acq-bounces", + "acquisitions", + "holdtime-min", + "holdtime-max", + "holdtime-total"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "\n"); +} + +static void *ls_start(struct seq_file *m, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + + if (data->iter == data->stats) + seq_header(m); + + if 
(data->iter == data->iter_end) + data->iter = NULL; + + return data->iter; +} + +static void *ls_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + + (*pos)++; + + data->iter = v; + data->iter++; + if (data->iter == data->iter_end) + data->iter = NULL; + + return data->iter; +} + +static void ls_stop(struct seq_file *m, void *v) +{ +} + +static int ls_show(struct seq_file *m, void *v) +{ + struct lock_stat_seq *data = m->private; + + seq_stats(m, data->iter); + return 0; +} + +static struct seq_operations lockstat_ops = { + .start = ls_start, + .next = ls_next, + .stop = ls_stop, + .show = ls_show, +}; + +static int lock_stat_open(struct inode *inode, struct file *file) +{ + int res; + struct lock_class *class; + struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); + + if (!data) + return -ENOMEM; + + res = seq_open(file, &lockstat_ops); + if (!res) { + struct lock_stat_data *iter = data->stats; + struct seq_file *m = file->private_data; + + data->iter = iter; + list_for_each_entry(class, &all_lock_classes, lock_entry) { + iter->class = class; + iter->stats = lock_stats(class); + iter++; + } + data->iter_end = iter; + + sort(data->stats, data->iter_end - data->iter, + sizeof(struct lock_stat_data), + lock_stat_cmp, NULL); + + m->private = data; + } else + vfree(data); + + return res; +} + +static ssize_t lock_stat_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct lock_class *class; + char c; + + if (count) { + if (get_user(c, buf)) + return -EFAULT; + + if (c != '0') + return count; + + list_for_each_entry(class, &all_lock_classes, lock_entry) + clear_lock_stats(class); + } + return count; +} + +static int lock_stat_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + vfree(seq->private); + seq->private = NULL; + return seq_release(inode, file); +} + +static const struct file_operations proc_lock_stat_operations = { + .open = lock_stat_open, + .write = lock_stat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = lock_stat_release, +}; +#endif /* CONFIG_LOCK_STAT */ + static int __init lockdep_proc_init(void) { struct proc_dir_entry *entry; @@ -354,6 +647,12 @@ static int __init lockdep_proc_init(void) if (entry) entry->proc_fops = &proc_lockdep_stats_operations; +#ifdef CONFIG_LOCK_STAT + entry = create_proc_entry("lock_stat", S_IRUSR, NULL); + if (entry) + entry->proc_fops = &proc_lock_stat_operations; +#endif + return 0; } diff --git a/kernel/module.c b/kernel/module.c index 33c04ad5117..db0ead0363e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module *mod, char *buffer) { - /* sysfs holds a reference */ - return sprintf(buffer, "%u\n", module_refcount(mod)-1); + return sprintf(buffer, "%u\n", module_refcount(mod)); } static struct module_attribute refcnt = { diff --git a/kernel/mutex.c b/kernel/mutex.c index 303eab18484..691b86564dd 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -139,6 +139,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass) list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + old_val = atomic_xchg(&lock->count, -1); + if (old_val == 1) + goto done; + + lock_contended(&lock->dep_map, _RET_IP_); + for (;;) { /* * Lets try to take the lock again - this is needed even if @@ -174,6 +180,8 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass) spin_lock_mutex(&lock->wait_lock, flags); } +done: + lock_acquired(&lock->dep_map); /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 10f0bbba382..f1decd21a53 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -20,6 +20,7 @@ #include <linux/mnt_namespace.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> +#include <net/net_namespace.h> static struct kmem_cache *nsproxy_cachep; @@ -98,8 +99,17 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_user; } + new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); + if (IS_ERR(new_nsp->net_ns)) { + err = PTR_ERR(new_nsp->net_ns); + goto out_net; + } + return new_nsp; +out_net: + if (new_nsp->user_ns) + put_user_ns(new_nsp->user_ns); out_user: if (new_nsp->pid_ns) put_pid_ns(new_nsp->pid_ns); @@ -132,7 +142,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) get_nsproxy(old_ns); - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER))) + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWUSER | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) { @@ -164,6 +174,7 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); if (ns->user_ns) put_user_ns(ns->user_ns); + put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -177,7 +188,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWUSER))) + CLONE_NEWUSER | CLONE_NEWNET))) return 0; if (!capable(CAP_SYS_ADMIN)) @@ -193,7 +204,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, static int __init nsproxy_cache_init(void) { nsproxy_cachep = kmem_cache_create("nsproxy", sizeof(struct nsproxy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); return 0; } diff --git a/kernel/params.c b/kernel/params.c index effbaaedd7f..4e57732fcfb 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -567,7 +567,12 @@ static void __init kernel_param_sysfs_setup(const char *name, kobject_set_name(&mk->kobj, name); kobject_init(&mk->kobj); ret = kobject_add(&mk->kobj); - BUG_ON(ret < 0); + if (ret) { + printk(KERN_ERR "Module '%s' failed to be added to sysfs, " + "error number %d\n", name, ret); + printk(KERN_ERR "The system will be unstable now.\n"); + return; + } param_sysfs_setup(mk, kparam, num_params, name_skip); kobject_uevent(&mk->kobj, KOBJ_ADD); } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 329ce017207..7a15afb73ed 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -241,7 +241,7 @@ static __init int init_posix_timers(void) register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof (struct k_itimer), 0, 0, NULL, NULL); + sizeof (struct k_itimer), 0, 0, NULL); idr_init(&posix_timers_id); return 0; } @@ -547,9 +547,9 @@ sys_timer_create(const clockid_t which_clock, new_timer->it_process = process; list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) get_task_struct(process); + spin_unlock_irqrestore(&process->sighand->siglock, flags); } else { spin_unlock_irqrestore(&process->sighand->siglock, flags); process = NULL; @@ -605,13 +605,14 @@ static struct k_itimer * 
lock_timer(timer_t timer_id, unsigned long *flags) timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); if (timr) { spin_lock(&timr->it_lock); - spin_unlock(&idr_lock); if ((timr->it_id != timer_id) || !(timr->it_process) || timr->it_process->tgid != current->tgid) { - unlock_timer(timr, *flags); + spin_unlock(&timr->it_lock); + spin_unlock_irqrestore(&idr_lock, *flags); timr = NULL; - } + } else + spin_unlock(&idr_lock); } else spin_unlock_irqrestore(&idr_lock, *flags); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 495b7d4dd33..14b0e10dc95 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -33,13 +33,20 @@ config PM_DEBUG bool "Power Management Debug Support" depends on PM ---help--- - This option enables verbose debugging support in the Power Management - code. This is helpful when debugging and reporting various PM bugs, - like suspend support. + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_VERBOSE + bool "Verbose Power Management debugging" + depends on PM_DEBUG + default n + ---help--- + This option enables verbose messages from the Power Management code. config DISABLE_CONSOLE_SUSPEND bool "Keep console(s) enabled during suspend/resume (DANGEROUS)" - depends on PM && PM_DEBUG + depends on PM_DEBUG && PM_SLEEP default n ---help--- This option turns off the console suspend mechanism that prevents @@ -50,7 +57,7 @@ config DISABLE_CONSOLE_SUSPEND config PM_TRACE bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL + depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL default n ---help--- This enables some cheesy code to save the last PM event point in the @@ -65,21 +72,58 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. -config PM_SYSFS_DEPRECATED - bool "Driver model /sys/devices/.../power/state files (DEPRECATED)" - depends on PM && SYSFS - default n - help - The driver model started out with a sysfs file intended to provide - a userspace hook for device power management. This feature has never - worked very well, except for limited testing purposes, and so it will - be removed. It's not clear that a generic mechanism could really - handle the wide variability of device power states; any replacements - are likely to be bus or driver specific. - -config SOFTWARE_SUSPEND - bool "Software Suspend (Hibernation)" - depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) +config PM_SLEEP_SMP + bool + depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE + depends on PM_SLEEP + select HOTPLUG_CPU + default y + +config PM_SLEEP + bool + depends on SUSPEND || HIBERNATION + default y + +config SUSPEND_UP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \ + || SUPERH || FRV + depends on !SMP + default y + +config SUSPEND_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) \ + || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM + depends on SMP + default y + +config SUSPEND + bool "Suspend to RAM and standby" + depends on PM + depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE + default y + ---help--- + Allow the system to enter sleep states in which main memory is + powered and thus its contents are preserved, such as the + suspend-to-RAM state (i.e. the ACPI S3 state). 
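/*
 * Illustrative sketch, not part of the patch: with the Kconfig split above,
 * code that triggers suspend-to-RAM programmatically would only be built
 * when CONFIG_SUSPEND is set, and would go through pm_suspend(), which
 * kernel/power/main.c exports further down. The wrapper name is made up;
 * the declarations come from the PM headers (linux/suspend.h or linux/pm.h
 * depending on the tree).
 */
#ifdef CONFIG_SUSPEND
static int example_enter_mem_sleep(void)
{
	/* PM_SUSPEND_MEM is suspend-to-RAM, i.e. ACPI S3 on x86 */
	return pm_suspend(PM_SUSPEND_MEM);
}
#endif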
+ +config HIBERNATION_UP_POSSIBLE + bool + depends on X86 || PPC64_SWSUSP || PPC32 + depends on !SMP + default y + +config HIBERNATION_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP + depends on SMP + default y + +config HIBERNATION + bool "Hibernation (aka 'suspend to disk')" + depends on PM && SWAP + depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -117,7 +161,7 @@ config SOFTWARE_SUSPEND config PM_STD_PARTITION string "Default resume partition" - depends on SOFTWARE_SUSPEND + depends on HIBERNATION default "" ---help--- The default resume partition is the partition that the suspend- @@ -137,11 +181,6 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. -config SUSPEND_SMP - bool - depends on HOTPLUG_CPU && (X86 || PPC64) && PM - default y - config APM_EMULATION tristate "Advanced Power Management Emulation" depends on PM && SYS_SUPPORTS_APM_EMULATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 38725f526af..f7dfff28ecd 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif -obj-y := main.o process.o console.o +obj-y := main.o obj-$(CONFIG_PM_LEGACY) += pm.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o +obj-$(CONFIG_PM_SLEEP) += process.o console.o +obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f445b9cd60f..eb72255b5c8 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -45,7 +45,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -struct hibernation_ops *hibernation_ops; +static struct hibernation_ops *hibernation_ops; /** * hibernation_set_ops - set the global hibernate operations @@ -54,7 +54,8 @@ struct hibernation_ops *hibernation_ops; void hibernation_set_ops(struct hibernation_ops *ops) { - if (ops && !(ops->prepare && ops->enter && ops->finish)) { + if (ops && !(ops->prepare && ops->enter && ops->finish + && ops->pre_restore && ops->restore_cleanup)) { WARN_ON(1); return; } @@ -74,9 +75,9 @@ void hibernation_set_ops(struct hibernation_ops *ops) * platform driver if so configured and return an error code if it fails */ -static int platform_prepare(void) +static int platform_prepare(int platform_mode) { - return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ? + return (platform_mode && hibernation_ops) ? hibernation_ops->prepare() : 0; } @@ -85,13 +86,146 @@ static int platform_prepare(void) * using the platform driver (must be called after platform_prepare()) */ -static void platform_finish(void) +static void platform_finish(int platform_mode) { - if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) + if (platform_mode && hibernation_ops) hibernation_ops->finish(); } /** + * platform_pre_restore - prepare the platform for the restoration from a + * hibernation image. If the restore fails after this function has been + * called, platform_restore_cleanup() must be called. + */ + +static int platform_pre_restore(int platform_mode) +{ + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; +} + +/** + * platform_restore_cleanup - switch the platform to the normal mode of + * operation after a failing restore. 
If platform_pre_restore() has been + * called before the failing restore, this function must be called too, + * regardless of the result of platform_pre_restore(). + */ + +static void platform_restore_cleanup(int platform_mode) +{ + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); +} + +/** + * hibernation_snapshot - quiesce devices and create the hibernation + * snapshot image. + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform firmware for the power transition. + * + * Must be called with pm_mutex held + */ + +int hibernation_snapshot(int platform_mode) +{ + int error; + + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + return error; + + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_console; + + error = platform_prepare(platform_mode); + if (error) + goto Resume_devices; + + error = disable_nonboot_cpus(); + if (!error) { + if (hibernation_mode != HIBERNATION_TEST) { + in_suspend = 1; + error = swsusp_suspend(); + /* Control returns here after successful restore */ + } else { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + } + } + enable_nonboot_cpus(); + Resume_devices: + platform_finish(platform_mode); + device_resume(); + Resume_console: + resume_console(); + return error; +} + +/** + * hibernation_restore - quiesce devices and restore the hibernation + * snapshot image. If successful, control returns in hibernation_snapshot() + * @platform_mode - if set, use the platform driver, if available, to + * prepare the platform firmware for the transition. + * + * Must be called with pm_mutex held + */ + +int hibernation_restore(int platform_mode) +{ + int error; + + pm_prepare_console(); + suspend_console(); + error = device_suspend(PMSG_PRETHAW); + if (error) + goto Finish; + + error = platform_pre_restore(platform_mode); + if (!error) { + error = disable_nonboot_cpus(); + if (!error) + error = swsusp_resume(); + enable_nonboot_cpus(); + } + platform_restore_cleanup(platform_mode); + device_resume(); + Finish: + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - enter the hibernation state using the + * platform driver (if available) + */ + +int hibernation_platform_enter(void) +{ + int error; + + if (hibernation_ops) { + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we + * should let the firmware know that we're going to enter the + * sleep state after all + */ + error = hibernation_ops->prepare(); + sysdev_shutdown(); + if (!error) + error = hibernation_ops->enter(); + } else { + error = -ENOSYS; + } + return error; +} + +/** + * power_down - Shut the machine down for hibernation. 
* * Use the platform driver, if configured so; otherwise try @@ -111,11 +245,7 @@ static void power_down(void) kernel_restart(NULL); break; case HIBERNATION_PLATFORM: - if (hibernation_ops) { - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - hibernation_ops->enter(); - break; - } + hibernation_platform_enter(); } kernel_halt(); /* @@ -152,9 +282,16 @@ int hibernate(void) { int error; + mutex_lock(&pm_mutex); /* The snapshot device should not be opened while we're running */ - if (!atomic_add_unless(&snapshot_device_available, -1, 0)) - return -EBUSY; + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + goto Exit; /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -165,75 +302,35 @@ int hibernate(void) if (error) goto Finish; - mutex_lock(&pm_mutex); if (hibernation_mode == HIBERNATION_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Thaw; } + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (in_suspend && !error) { + unsigned int flags = 0; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - goto Thaw; - - error = platform_prepare(); - if (error) - goto Thaw; - - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) { - printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_devices; - } - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpus; - - if (hibernation_mode == HIBERNATION_TEST) { - printk("swsusp debug: Waiting for 5 seconds.\n"); - mdelay(5000); - goto Enable_cpus; - } - - pr_debug("PM: snapshotting memory.\n"); - in_suspend = 1; - error = swsusp_suspend(); - if (error) - goto Enable_cpus; - - if (in_suspend) { - enable_nonboot_cpus(); - platform_finish(); - device_resume(); - resume_console(); + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; pr_debug("PM: writing image.\n"); - error = swsusp_write(); + error = swsusp_write(flags); + swsusp_free(); if (!error) power_down(); - else { - swsusp_free(); - goto Thaw; - } } else { pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); } - - swsusp_free(); - Enable_cpus: - enable_nonboot_cpus(); - Resume_devices: - platform_finish(); - device_resume(); - resume_console(); Thaw: - mutex_unlock(&pm_mutex); unprepare_processes(); Finish: free_basic_memory_bitmaps(); Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); atomic_inc(&snapshot_device_available); + Unlock: + mutex_unlock(&pm_mutex); return error; } @@ -253,6 +350,7 @@ int hibernate(void) static int software_resume(void) { int error; + unsigned int flags; mutex_lock(&pm_mutex); if (!swsusp_resume_device) { @@ -300,30 +398,12 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - error = swsusp_read(); - if (error) { - swsusp_free(); - goto Thaw; - } - - pr_debug("PM: Preparing devices for restore.\n"); - - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (error) - goto Free; - - error = disable_nonboot_cpus(); + error = swsusp_read(&flags); if (!error) - swsusp_resume(); + hibernation_restore(flags & SF_PLATFORM_MODE); - enable_nonboot_cpus(); - Free: - swsusp_free(); - device_resume(); - resume_console(); - Thaw: printk(KERN_ERR "PM: Restore failed, recovering.\n"); + swsusp_free(); unprepare_processes(); Done: free_basic_memory_bitmaps(); @@ -333,7 +413,7 @@ static int software_resume(void) Unlock: 
mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); - return 0; + return error; } late_initcall(software_resume); diff --git a/kernel/power/main.c b/kernel/power/main.c index fc45ed22620..350b485b3b6 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -23,11 +23,15 @@ #include "power.h" -/*This is just an arbitrary number */ -#define FREE_PAGE_NUMBER (100) +BLOCKING_NOTIFIER_HEAD(pm_chain_head); DEFINE_MUTEX(pm_mutex); +#ifdef CONFIG_SUSPEND + +/* This is just an arbitrary number */ +#define FREE_PAGE_NUMBER (100) + struct pm_ops *pm_ops; /** @@ -63,14 +67,11 @@ static inline void pm_finish(suspend_state_t state) /** * suspend_prepare - Do prep work before entering low-power state. - * @state: State we're entering. * - * This is common code that is called for each state that we're - * entering. Allocate a console, stop all processes, then make sure - * the platform can enter the requested state. + * This is common code that is called for each state that we're entering. + * Run suspend notifiers, allocate a console and stop all processes. */ - -static int suspend_prepare(suspend_state_t state) +static int suspend_prepare(void) { int error; unsigned int free_pages; @@ -78,6 +79,10 @@ static int suspend_prepare(suspend_state_t state) if (!pm_ops || !pm_ops->enter) return -EPERM; + error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); + if (error) + goto Finish; + pm_prepare_console(); if (freeze_processes()) { @@ -85,46 +90,23 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } - if ((free_pages = global_page_state(NR_FREE_PAGES)) - < FREE_PAGE_NUMBER) { + free_pages = global_page_state(NR_FREE_PAGES); + if (free_pages < FREE_PAGE_NUMBER) { pr_debug("PM: free some memory\n"); shrink_all_memory(FREE_PAGE_NUMBER - free_pages); if (nr_free_pages() < FREE_PAGE_NUMBER) { error = -ENOMEM; printk(KERN_ERR "PM: No enough memory\n"); - goto Thaw; } } - - if (pm_ops->set_target) { - error = pm_ops->set_target(state); - if (error) - goto Thaw; - } - suspend_console(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "Some devices failed to suspend\n"); - goto Resume_console; - } - if (pm_ops->prepare) { - if ((error = pm_ops->prepare(state))) - goto Resume_devices; - } - - error = disable_nonboot_cpus(); if (!error) return 0; - enable_nonboot_cpus(); - pm_finish(state); - Resume_devices: - device_resume(); - Resume_console: - resume_console(); Thaw: thaw_processes(); pm_restore_console(); + Finish: + pm_notifier_call_chain(PM_POST_SUSPEND); return error; } @@ -140,6 +122,12 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) local_irq_enable(); } +/** + * suspend_enter - enter the desired system sleep state. + * @state: state to enter + * + * This function should be called after devices have been suspended. + */ int suspend_enter(suspend_state_t state) { int error = 0; @@ -159,23 +147,58 @@ int suspend_enter(suspend_state_t state) return error; } +/** + * suspend_devices_and_enter - suspend devices and enter the desired system sleep + * state. 
+ * @state: state to enter + */ +int suspend_devices_and_enter(suspend_state_t state) +{ + int error; + + if (!pm_ops) + return -ENOSYS; + + if (pm_ops->set_target) { + error = pm_ops->set_target(state); + if (error) + return error; + } + suspend_console(); + error = device_suspend(PMSG_SUSPEND); + if (error) { + printk(KERN_ERR "Some devices failed to suspend\n"); + goto Resume_console; + } + if (pm_ops->prepare) { + error = pm_ops->prepare(state); + if (error) + goto Resume_devices; + } + error = disable_nonboot_cpus(); + if (!error) + suspend_enter(state); + + enable_nonboot_cpus(); + pm_finish(state); + Resume_devices: + device_resume(); + Resume_console: + resume_console(); + return error; +} /** * suspend_finish - Do final work before exiting suspend sequence. - * @state: State we're coming out of. * * Call platform code to clean up, restart processes, and free the * console that we've allocated. This is not called for suspend-to-disk. */ - -static void suspend_finish(suspend_state_t state) +static void suspend_finish(void) { - enable_nonboot_cpus(); - pm_finish(state); - device_resume(); - resume_console(); thaw_processes(); pm_restore_console(); + pm_notifier_call_chain(PM_POST_SUSPEND); } @@ -207,7 +230,6 @@ static inline int valid_state(suspend_state_t state) * Then, do the setup for suspend, enter the state, and cleaup (after * we've woken up). */ - static int enter_state(suspend_state_t state) { int error; @@ -218,14 +240,14 @@ static int enter_state(suspend_state_t state) return -EBUSY; pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); - if ((error = suspend_prepare(state))) + if ((error = suspend_prepare())) goto Unlock; pr_debug("PM: Entering %s sleep\n", pm_states[state]); - error = suspend_enter(state); + error = suspend_devices_and_enter(state); pr_debug("PM: Finishing wakeup.\n"); - suspend_finish(state); + suspend_finish(); Unlock: mutex_unlock(&pm_mutex); return error; @@ -249,6 +271,8 @@ int pm_suspend(suspend_state_t state) EXPORT_SYMBOL(pm_suspend); +#endif /* CONFIG_SUSPEND */ + decl_subsys(power,NULL,NULL); @@ -265,14 +289,16 @@ decl_subsys(power,NULL,NULL); static ssize_t state_show(struct kset *kset, char *buf) { + char *s = buf; +#ifdef CONFIG_SUSPEND int i; - char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { if (pm_states[i] && valid_state(i)) s += sprintf(s,"%s ", pm_states[i]); } -#ifdef CONFIG_SOFTWARE_SUSPEND +#endif +#ifdef CONFIG_HIBERNATION s += sprintf(s, "%s\n", "disk"); #else if (s != buf) @@ -284,11 +310,13 @@ static ssize_t state_show(struct kset *kset, char *buf) static ssize_t state_store(struct kset *kset, const char *buf, size_t n) { +#ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_STANDBY; const char * const *s; +#endif char *p; - int error; int len; + int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; @@ -296,17 +324,19 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n) /* First, check if we are requested to hibernate */ if (len == 4 && !strncmp(buf, "disk", len)) { error = hibernate(); - return error ? error : n; + goto Exit; } +#ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) break; } if (state < PM_SUSPEND_MAX && *s) error = enter_state(state); - else - error = -EINVAL; +#endif + + Exit: return error ? 
error : n; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 51381487103..95fbf2dd3fe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -13,7 +13,7 @@ struct swsusp_info { -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* * Keep some memory free so that I/O operations can succeed without paging * [Might this be more than 4 MB?] @@ -25,7 +25,10 @@ struct swsusp_info { */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) -extern struct hibernation_ops *hibernation_ops; +/* kernel/power/disk.c */ +extern int hibernation_snapshot(int platform_mode); +extern int hibernation_restore(int platform_mode); +extern int hibernation_platform_enter(void); #endif extern int pfn_is_nosave(unsigned long); @@ -152,16 +155,42 @@ extern sector_t alloc_swapdev_block(int swap); extern void free_all_swap_pages(int swap); extern int swsusp_swap_in_use(void); +/* + * Flags that can be passed from the hibernatig hernel to the "boot" kernel in + * the image header. + */ +#define SF_PLATFORM_MODE 1 + +/* kernel/power/disk.c */ extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); extern int swsusp_suspend(void); extern int swsusp_resume(void); -extern int swsusp_read(void); -extern int swsusp_write(void); +extern int swsusp_read(unsigned int *flags_p); +extern int swsusp_write(unsigned int flags); extern void swsusp_close(void); -extern int suspend_enter(suspend_state_t state); struct timeval; +/* kernel/power/swsusp.c */ extern void swsusp_show_speed(struct timeval *, struct timeval *, unsigned int, char *); + +#ifdef CONFIG_SUSPEND +/* kernel/power/main.c */ +extern int suspend_devices_and_enter(suspend_state_t state); +#else /* !CONFIG_SUSPEND */ +static inline int suspend_devices_and_enter(suspend_state_t state) +{ + return -ENOSYS; +} +#endif /* !CONFIG_SUSPEND */ + +/* kernel/power/common.c */ +extern struct blocking_notifier_head pm_chain_head; + +static inline int pm_notifier_call_chain(unsigned long val) +{ + return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) + == NOTIFY_BAD) ? -EINVAL : 0; +} diff --git a/kernel/power/process.c b/kernel/power/process.c index e0233d8422b..3434940a3df 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -40,7 +40,7 @@ static inline void frozen_process(void) current->flags |= PF_FROZEN; wmb(); } - clear_tsk_thread_flag(current, TIF_FREEZE); + clear_freeze_flag(current); } /* Refrigerator is place where frozen processes are stored :-). 
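For context on the refrigerator that the following hunks rework: a freezeable kernel thread cooperates by polling try_to_freeze(), which parks it in refrigerator() whenever its freeze flag (now set via set_freeze_flag()) is raised. A sketch of such a loop; foo_do_work() is hypothetical and the set_freezable() opt-in helper is assumed to exist in this tree:

#include <linux/kthread.h>
#include <linux/freezer.h>

static void foo_do_work(void);                  /* hypothetical */

static int foo_kthread(void *unused)
{
        set_freezable();                        /* assumed opt-in helper */

        while (!kthread_should_stop()) {
                try_to_freeze();                /* blocks in refrigerator() while frozen */
                foo_do_work();
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}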
*/ @@ -72,20 +72,19 @@ void refrigerator(void) schedule(); } pr_debug("%s left refrigerator\n", current->comm); - current->state = save; + __set_current_state(save); } -static inline void freeze_process(struct task_struct *p) +static void freeze_task(struct task_struct *p) { unsigned long flags; if (!freezing(p)) { rmb(); if (!frozen(p)) { + set_freeze_flag(p); if (p->state == TASK_STOPPED) force_sig_specific(SIGSTOP, p); - - freeze(p); spin_lock_irqsave(&p->sighand->siglock, flags); signal_wake_up(p, p->state == TASK_STOPPED); spin_unlock_irqrestore(&p->sighand->siglock, flags); @@ -99,19 +98,14 @@ static void cancel_freezing(struct task_struct *p) if (freezing(p)) { pr_debug(" clean up: %s\n", p->comm); - do_not_freeze(p); + clear_freeze_flag(p); spin_lock_irqsave(&p->sighand->siglock, flags); recalc_sigpending_and_wake(p); spin_unlock_irqrestore(&p->sighand->siglock, flags); } } -static inline int is_user_space(struct task_struct *p) -{ - return p->mm && !(p->flags & PF_BORROWED_MM); -} - -static unsigned int try_to_freeze_tasks(int freeze_user_space) +static int try_to_freeze_tasks(int freeze_user_space) { struct task_struct *g, *p; unsigned long end_time; @@ -122,26 +116,40 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) + if (frozen(p) || !freezeable(p)) continue; - if (frozen(p)) - continue; - - if (p->state == TASK_TRACED && frozen(p->parent)) { - cancel_freezing(p); - continue; + if (freeze_user_space) { + if (p->state == TASK_TRACED && + frozen(p->parent)) { + cancel_freezing(p); + continue; + } + /* + * Kernel threads should not have TIF_FREEZE set + * at this point, so we must ensure that either + * p->mm is not NULL *and* PF_BORROWED_MM is + * unset, or TIF_FRREZE is left unset. + * The task_lock() is necessary to prevent races + * with exit_mm() or use_mm()/unuse_mm() from + * occuring. + */ + task_lock(p); + if (!p->mm || (p->flags & PF_BORROWED_MM)) { + task_unlock(p); + continue; + } + freeze_task(p); + task_unlock(p); + } else { + freeze_task(p); } - if (freeze_user_space && !is_user_space(p)) - continue; - - freeze_process(p); if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ - if (todo && time_after(jiffies, end_time)) + if (time_after(jiffies, end_time)) break; } while (todo); @@ -152,49 +160,41 @@ static unsigned int try_to_freeze_tasks(int freeze_user_space) * but it cleans up leftover PF_FREEZE requests. */ printk("\n"); - printk(KERN_ERR "Stopping %s timed out after %d seconds " + printk(KERN_ERR "Freezing of %s timed out after %d seconds " "(%d tasks refusing to freeze):\n", - freeze_user_space ? "user space processes" : - "kernel threads", + freeze_user_space ? "user space " : "tasks ", TIMEOUT / HZ, todo); + show_state(); read_lock(&tasklist_lock); do_each_thread(g, p) { - if (freeze_user_space && !is_user_space(p)) - continue; - task_lock(p); - if (freezeable(p) && !frozen(p) && - !freezer_should_skip(p)) + if (freezing(p) && !freezer_should_skip(p)) printk(KERN_ERR " %s\n", p->comm); - cancel_freezing(p); task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } - return todo; + return todo ? -EBUSY : 0; } /** * freeze_processes - tell processes to enter the refrigerator - * - * Returns 0 on success, or the number of processes that didn't freeze, - * although they were told to. 
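Since try_to_freeze_tasks() now reports -EBUSY instead of a count of unfrozen tasks, callers treat freeze_processes() like any other errno-returning helper and undo a partial freeze themselves. A minimal caller sketch, mirroring what the snapshot ioctl path does later in this series:

static int foo_freeze_everything(void)
{
        int error;

        error = freeze_processes();
        if (error)
                thaw_processes();       /* a partial freeze is the caller's job to undo */
        return error;
}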
*/ int freeze_processes(void) { - unsigned int nr_unfrozen; + int error; printk("Stopping tasks ... "); - nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); - if (nr_unfrozen) - return nr_unfrozen; + error = try_to_freeze_tasks(FREEZER_USER_SPACE); + if (error) + return error; sys_sync(); - nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); - if (nr_unfrozen) - return nr_unfrozen; + error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); + if (error) + return error; printk("done.\n"); BUG_ON(in_atomic()); @@ -210,7 +210,7 @@ static void thaw_tasks(int thaw_user_space) if (!freezeable(p)) continue; - if (is_user_space(p) == !thaw_user_space) + if (!p->mm == thaw_user_space) continue; thaw_process(p); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a3b7854b8f7..a686590d88c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -709,7 +709,8 @@ static void mark_nosave_pages(struct memory_bitmap *bm) region->end_pfn << PAGE_SHIFT); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - memory_bm_set_bit(bm, pfn); + if (pfn_valid(pfn)) + memory_bm_set_bit(bm, pfn); } } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8b1a1b83714..917aba10057 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -33,8 +33,9 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; sector_t image; + unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; } __attribute__((packed)); @@ -138,7 +139,7 @@ static int wait_on_bio_chain(struct bio **bio_chain) * Saving part */ -static int mark_swapfiles(sector_t start) +static int mark_swapfiles(sector_t start, unsigned int flags) { int error; @@ -148,6 +149,7 @@ static int mark_swapfiles(sector_t start) memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); swsusp_header->image = start; + swsusp_header->flags = flags; error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { @@ -369,6 +371,7 @@ static int enough_swap(unsigned int nr_pages) /** * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header * * It is important _NOT_ to umount filesystems at this point. We want * them synced (in case something goes wrong) but we DO not want to mark @@ -376,7 +379,7 @@ static int enough_swap(unsigned int nr_pages) * correctly, we'll mark system clean, anyway.) */ -int swsusp_write(void) +int swsusp_write(unsigned int flags) { struct swap_map_handle handle; struct snapshot_handle snapshot; @@ -415,7 +418,7 @@ int swsusp_write(void) if (!error) { flush_swap_writer(&handle); printk("S"); - error = mark_swapfiles(start); + error = mark_swapfiles(start, flags); printk("|\n"); } } @@ -540,13 +543,20 @@ static int load_image(struct swap_map_handle *handle, return error; } -int swsusp_read(void) +/** + * swsusp_read - read the hibernation image. 
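The flags word stored in swsusp_header is how the hibernating kernel tells the "boot" kernel whether the image was written in platform mode (SF_PLATFORM_MODE). The resume side is therefore expected to look roughly like the sketch below; the actual disk.c caller is not part of these hunks, so treat this as an illustration only:

static int foo_resume_from_image(void)
{
        unsigned int flags;
        int error;

        error = swsusp_read(&flags);    /* fills in the header flags */
        if (!error)
                error = hibernation_restore(flags & SF_PLATFORM_MODE);
        return error;
}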
+ * @flags_p: flags passed by the "frozen" kernel in the image header should + * be written into this memeory location + */ + +int swsusp_read(unsigned int *flags_p) { int error; struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + *flags_p = swsusp_header->flags; if (IS_ERR(resume_bdev)) { pr_debug("swsusp: block device not initialised\n"); return PTR_ERR(resume_bdev); diff --git a/kernel/power/user.c b/kernel/power/user.c index d65305b515b..bd0723a7df3 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -128,92 +128,6 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } -static inline int platform_prepare(void) -{ - int error = 0; - - if (hibernation_ops) - error = hibernation_ops->prepare(); - - return error; -} - -static inline void platform_finish(void) -{ - if (hibernation_ops) - hibernation_ops->finish(); -} - -static inline int snapshot_suspend(int platform_suspend) -{ - int error; - - mutex_lock(&pm_mutex); - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (error) - goto Finish; - - if (platform_suspend) { - error = platform_prepare(); - if (error) - goto Finish; - } - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (error) - goto Resume_devices; - - error = disable_nonboot_cpus(); - if (!error) { - in_suspend = 1; - error = swsusp_suspend(); - } - enable_nonboot_cpus(); - Resume_devices: - if (platform_suspend) - platform_finish(); - - device_resume(); - resume_console(); - Finish: - mutex_unlock(&pm_mutex); - return error; -} - -static inline int snapshot_restore(int platform_suspend) -{ - int error; - - mutex_lock(&pm_mutex); - pm_prepare_console(); - if (platform_suspend) { - error = platform_prepare(); - if (error) - goto Finish; - } - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (error) - goto Resume_devices; - - error = disable_nonboot_cpus(); - if (!error) - error = swsusp_resume(); - - enable_nonboot_cpus(); - Resume_devices: - if (platform_suspend) - platform_finish(); - - device_resume(); - resume_console(); - Finish: - pm_restore_console(); - mutex_unlock(&pm_mutex); - return error; -} - static int snapshot_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { @@ -237,10 +151,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (data->frozen) break; mutex_lock(&pm_mutex); - if (freeze_processes()) { - thaw_processes(); - error = -EBUSY; + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (!error) { + error = freeze_processes(); + if (error) + thaw_processes(); } + if (error) + pm_notifier_call_chain(PM_POST_HIBERNATION); mutex_unlock(&pm_mutex); if (!error) data->frozen = 1; @@ -251,6 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; mutex_lock(&pm_mutex); thaw_processes(); + pm_notifier_call_chain(PM_POST_HIBERNATION); mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -260,7 +179,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_suspend(data->platform_suspend); + error = hibernation_snapshot(data->platform_suspend); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -274,7 +193,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_restore(data->platform_suspend); + error = hibernation_restore(data->platform_suspend); break; case SNAPSHOT_FREE: @@ -336,47 
+255,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_S2RAM: - if (!pm_ops) { - error = -ENOSYS; - break; - } - if (!data->frozen) { error = -EPERM; break; } - if (!mutex_trylock(&pm_mutex)) { error = -EBUSY; break; } - - if (pm_ops->prepare) { - error = pm_ops->prepare(PM_SUSPEND_MEM); - if (error) - goto OutS3; - } - - /* Put devices to sleep */ - suspend_console(); - error = device_suspend(PMSG_SUSPEND); - if (error) { - printk(KERN_ERR "Failed to suspend some devices.\n"); - } else { - error = disable_nonboot_cpus(); - if (!error) { - /* Enter S3, system is already frozen */ - suspend_enter(PM_SUSPEND_MEM); - enable_nonboot_cpus(); - } - /* Wake up devices */ - device_resume(); - } - resume_console(); - if (pm_ops->finish) - pm_ops->finish(PM_SUSPEND_MEM); - - OutS3: + /* + * Tasks are frozen and the notifiers have been called with + * PM_HIBERNATION_PREPARE + */ + error = suspend_devices_and_enter(PM_SUSPEND_MEM); mutex_unlock(&pm_mutex); break; @@ -386,19 +277,14 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, switch (arg) { case PMOPS_PREPARE: - if (hibernation_ops) { - data->platform_suspend = 1; - error = 0; - } else { - error = -ENOSYS; - } + data->platform_suspend = 1; + error = 0; break; case PMOPS_ENTER: - if (data->platform_suspend) { - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = hibernation_ops->enter(); - } + if (data->platform_suspend) + error = hibernation_platform_enter(); + break; case PMOPS_FINISH: diff --git a/kernel/printk.c b/kernel/printk.c index 051d27e36a6..8451dfc31d2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -732,7 +732,7 @@ int __init add_preferred_console(char *name, int idx, char *options) return 0; } -int __init update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) +int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) { struct console_cmdline *c; int i; @@ -1083,6 +1083,19 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +static int __init disable_boot_consoles(void) +{ + if (console_drivers != NULL) { + if (console_drivers->flags & CON_BOOT) { + printk(KERN_INFO "turn off boot console %s%d\n", + console_drivers->name, console_drivers->index); + return unregister_console(console_drivers); + } + } + return 0; +} +late_initcall(disable_boot_consoles); + /** * tty_write_message - write a message to a certain tty, not just the console. 
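disable_boot_consoles() above automatically drops an early console at late_initcall time, provided it was registered with CON_BOOT and is still at the head of the console list (the helper only inspects console_drivers itself). A hedged sketch of such an early console; foo_uart_putchars() stands in for whatever the platform uses to push characters out:

#include <linux/console.h>

static void foo_uart_putchars(const char *s, unsigned int n);   /* hypothetical */

static void foo_early_write(struct console *con, const char *s, unsigned int n)
{
        foo_uart_putchars(s, n);
}

static struct console foo_early_console = {
        .name   = "fooearly",
        .write  = foo_early_write,
        .flags  = CON_PRINTBUFFER | CON_BOOT,   /* CON_BOOT => removed by disable_boot_consoles() */
        .index  = -1,
};

static void __init foo_setup_early_console(void)
{
        register_console(&foo_early_console);   /* typically called from setup_arch() */
}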
* @tty: the destination tty_struct diff --git a/kernel/profile.c b/kernel/profile.c index 5b20fe977be..cb1e37d2dac 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -199,11 +199,11 @@ EXPORT_SYMBOL_GPL(register_timer_hook); EXPORT_SYMBOL_GPL(unregister_timer_hook); EXPORT_SYMBOL_GPL(task_handoff_register); EXPORT_SYMBOL_GPL(task_handoff_unregister); +EXPORT_SYMBOL_GPL(profile_event_register); +EXPORT_SYMBOL_GPL(profile_event_unregister); #endif /* CONFIG_PROFILING */ -EXPORT_SYMBOL_GPL(profile_event_register); -EXPORT_SYMBOL_GPL(profile_event_unregister); #ifdef CONFIG_SMP /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4a1745f1dad..3eca7a55f2e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -142,7 +142,7 @@ static int may_attach(struct task_struct *task) return -EPERM; smp_rmb(); if (task->mm) - dumpable = task->mm->dumpable; + dumpable = get_dumpable(task->mm); if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; @@ -233,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) /* Architecture-specific hardware disable .. */ ptrace_disable(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); write_lock_irq(&tasklist_lock); /* protect against de_thread()->release_task() */ diff --git a/kernel/relay.c b/kernel/relay.c index a615a8f513f..ad855017bc5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1,7 +1,7 @@ /* * Public API and common code for kernel->userspace relay file support. * - * See Documentation/filesystems/relayfs.txt for an overview of relayfs. + * See Documentation/filesystems/relay.txt for an overview. * * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) @@ -80,7 +80,7 @@ static struct vm_operations_struct relay_file_mmap_ops = { * * Caller should already have grabbed mmap_sem. */ -int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { unsigned long length = vma->vm_end - vma->vm_start; struct file *filp = vma->vm_file; @@ -145,7 +145,7 @@ depopulate: * * Returns channel buffer if successful, %NULL otherwise. */ -struct rchan_buf *relay_create_buf(struct rchan *chan) +static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); if (!buf) @@ -175,7 +175,7 @@ free_buf: * * Should only be called from kref_put(). */ -void relay_destroy_channel(struct kref *kref) +static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); kfree(chan); @@ -185,7 +185,7 @@ void relay_destroy_channel(struct kref *kref) * relay_destroy_buf - destroy an rchan_buf struct and associated buffer * @buf: the buffer struct */ -void relay_destroy_buf(struct rchan_buf *buf) +static void relay_destroy_buf(struct rchan_buf *buf) { struct rchan *chan = buf->chan; unsigned int i; @@ -210,7 +210,7 @@ void relay_destroy_buf(struct rchan_buf *buf) * rchan_buf_struct and the channel buffer. Should only be called from * kref_put(). */ -void relay_remove_buf(struct kref *kref) +static void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); buf->chan->cb->remove_buf_file(buf->dentry); @@ -223,11 +223,10 @@ void relay_remove_buf(struct kref *kref) * * Returns 1 if the buffer is empty, 0 otherwise. 
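The profile_event_register()/unregister() exports now sit with the rest of the CONFIG_PROFILING exports, so only profiling-enabled kernels offer them to modules. For context, a subscriber to the task-exit chain looks roughly like this sketch (per the profile notifier convention, the exiting task is passed as the notifier data pointer):

#include <linux/profile.h>
#include <linux/notifier.h>
#include <linux/sched.h>

static int foo_task_exit_notify(struct notifier_block *nb,
                                unsigned long val, void *data)
{
        struct task_struct *task = data;        /* PROFILE_TASK_EXIT hands us the exiting task */

        printk(KERN_DEBUG "foo: %s (pid %d) is exiting\n",
               task->comm, task->pid);
        return NOTIFY_OK;
}

static struct notifier_block foo_task_exit_nb = {
        .notifier_call = foo_task_exit_notify,
};

static int __init foo_profile_init(void)
{
        profile_event_register(PROFILE_TASK_EXIT, &foo_task_exit_nb);
        return 0;
}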
*/ -int relay_buf_empty(struct rchan_buf *buf) +static int relay_buf_empty(struct rchan_buf *buf) { return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; } -EXPORT_SYMBOL_GPL(relay_buf_empty); /** * relay_buf_full - boolean, is the channel buffer full? @@ -427,6 +426,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) free_buf: relay_destroy_buf(buf); + buf = NULL; free_name: kfree(tmpname); end: diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9a87886b022..1ec620c0306 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -20,7 +20,7 @@ void down_read(struct rw_semaphore *sem) might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(down_read); @@ -47,7 +47,7 @@ void down_write(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - __down_write(sem); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(down_write); @@ -111,7 +111,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - __down_read(sem); + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } EXPORT_SYMBOL(down_read_nested); @@ -130,7 +130,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - __down_write_nested(sem, subclass); + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } EXPORT_SYMBOL(down_write_nested); diff --git a/kernel/sched.c b/kernel/sched.c index cb31fb4a137..6c10fa796ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -53,6 +53,7 @@ #include <linux/percpu.h> #include <linux/kthread.h> #include <linux/seq_file.h> +#include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> #include <linux/tsacct_kern.h> @@ -60,6 +61,7 @@ #include <linux/delayacct.h> #include <linux/reciprocal_div.h> #include <linux/unistd.h> +#include <linux/pagemap.h> #include <asm/tlb.h> @@ -261,9 +263,9 @@ struct rq { s64 clock_max_delta; unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; - - struct sched_class *load_balance_class; + u64 idle_clock; + unsigned int clock_deep_idle_events; + u64 tick_timestamp; atomic_t nr_iowait; @@ -301,7 +303,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) @@ -319,15 +321,19 @@ static inline int cpu_of(struct rq *rq) } /* - * Per-runqueue clock, as finegrained as the platform can give us: + * Update the per-runqueue clock, as finegrained as the platform can give + * us, but without assuming monotonicity, etc.: */ -static unsigned long long __rq_clock(struct rq *rq) +static void __update_rq_clock(struct rq *rq) { u64 prev_raw = rq->prev_clock_raw; u64 now = sched_clock(); s64 delta = now - prev_raw; u64 clock = rq->clock; +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); +#endif /* * Protect against sched_clock() occasionally going backwards: */ @@ -338,8 +344,11 @@ static unsigned long long __rq_clock(struct rq *rq) /* * Catch too large forward jumps too: */ - if (unlikely(delta > 2*TICK_NSEC)) { - clock++; + if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { + if (clock < 
rq->tick_timestamp + TICK_NSEC) + clock = rq->tick_timestamp + TICK_NSEC; + else + clock++; rq->clock_overflows++; } else { if (unlikely(delta > rq->clock_max_delta)) @@ -350,18 +359,12 @@ static unsigned long long __rq_clock(struct rq *rq) rq->prev_clock_raw = now; rq->clock = clock; - - return clock; } -static inline unsigned long long rq_clock(struct rq *rq) +static void update_rq_clock(struct rq *rq) { - int this_cpu = smp_processor_id(); - - if (this_cpu == cpu_of(rq)) - return __rq_clock(rq); - - return rq->clock; + if (likely(smp_processor_id() == cpu_of(rq))) + __update_rq_clock(rq); } /* @@ -379,6 +382,25 @@ static inline unsigned long long rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + unsigned long long now; + unsigned long flags; + struct rq *rq; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); + now = rq->clock; + local_irq_restore(flags); + + return now; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* Change a task's ->cfs_rq if it moves across CPUs */ static inline void set_task_cfs_rq(struct task_struct *p) @@ -536,18 +558,40 @@ static inline struct rq *this_rq_lock(void) } /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * We are going deep-idle (irqs are disabled): */ -void sched_clock_unstable_event(void) +void sched_clock_idle_sleep_event(void) { - unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(smp_processor_id()); - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, &flags); + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; } +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* * resched_task - mark a task 'to be rescheduled now'. 
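This hunk also adds cpu_clock(cpu), a cheap kernel-internal per-CPU timestamp, and replaces sched_clock_unstable_event() with a sleep/wakeup pair so deep-idle code can hand the scheduler an externally measured delta instead of trusting a TSC that stopped. A sketch of the intended call pattern from an idle driver; foo_enter_deep_idle() is hypothetical and ktime_get() merely stands in for whichever always-running clock the platform trusts:

#include <linux/ktime.h>

static void foo_enter_deep_idle(void);          /* hypothetical C-state entry */

static void foo_deep_idle(void)
{
        ktime_t kt_before, kt_after;

        /* the idle path already runs with interrupts disabled */
        sched_clock_idle_sleep_event();         /* sync rq->clock before the TSC may stop */
        kt_before = ktime_get();

        foo_enter_deep_idle();

        kt_after = ktime_get();
        /* credit the slept time, measured on a clock that kept ticking */
        sched_clock_idle_wakeup_event(ktime_to_ns(ktime_sub(kt_after, kt_before)));
}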
@@ -622,27 +666,31 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) #define WMULT_SHIFT 32 -static inline unsigned long +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) { u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ - if (unlikely(tmp > WMULT_CONST)) { - tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) - >> (WMULT_SHIFT/2); - } else { - tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; - } + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } static inline unsigned long @@ -663,46 +711,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - -/* - * Update delta_exec, delta_fair fields for rq. - * - * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while - * delta_exec advances at the same rate as wall-clock (provided - * cpu is not idle). - * - * delta_exec / delta_fair is a measure of the (smoothened) load on this - * runqueue over any given interval. This (smoothened) load is used - * during load balance. - * - * This function is called /before/ updating rq->ls.load - * and when switching tasks. - */ -static void update_curr_load(struct rq *rq, u64 now) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = now; - ls->delta_stat += now - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. - */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); -} - /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -712,19 +720,6 @@ static void update_curr_load(struct rq *rq, u64 now) * slice expiry etc. */ -/* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define load_weight(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - load_weight(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) - #define WEIGHT_IDLEPRIO 2 #define WMULT_IDLEPRIO (1 << 31) @@ -741,11 +736,14 @@ static void update_curr_load(struct rq *rq, u64 now) * the relative distance between them is ~25%.) 
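To make the rewritten fixed-point math concrete, here is one worked evaluation of calc_delta_mine() using the nice +5 entries from the tables just below; the numbers are approximate and only illustrate the SRR() rounding:

/*
 * delta_exec = 1,000,000 ns, weight = NICE_0_LOAD (1024),
 * lw->weight = 335 (nice +5):
 *
 *   inv_weight = (WMULT_CONST - 335/2) / 335 + 1 = 12820798   (prio_to_wmult[25])
 *
 *   tmp = 1,000,000 * 1024 = 1.024e9, which fits under WMULT_CONST, so
 *
 *   SRR(tmp * inv_weight, 32) ~= 3,056,716
 *
 * i.e. roughly delta_exec * 1024 / 335 as intended, with SRR()'s
 * "+ (1UL << 31)" rounding to nearest instead of truncating.
 */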
*/ static const int prio_to_weight[40] = { -/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, -/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, -/* 0 */ NICE_0_LOAD /* 1024 */, -/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, -/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, }; /* @@ -756,42 +754,16 @@ static const int prio_to_weight[40] = { * into multiplications: */ static const u32 prio_to_wmult[40] = { -/* -20 */ 48356, 60446, 75558, 94446, 118058, -/* -15 */ 147573, 184467, 230589, 288233, 360285, -/* -10 */ 450347, 562979, 703746, 879575, 1099582, -/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, -/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, -/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, -/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, -/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static inline void -inc_load(struct rq *rq, const struct task_struct *p, u64 now) -{ - update_curr_load(rq, now); - update_load_add(&rq->ls.load, p->se.load.weight); -} - -static inline void -dec_load(struct rq *rq, const struct task_struct *p, u64 now) -{ - update_curr_load(rq, now); - update_load_sub(&rq->ls.load, p->se.load.weight); -} - -static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) -{ - rq->nr_running++; - inc_load(rq, p, now); -} - -static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) -{ - rq->nr_running--; - dec_load(rq, p, now); -} - static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); /* @@ -809,8 +781,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator); + int *this_best_prio, struct rq_iterator *iterator); #include "sched_stats.h" #include "sched_rt.c" @@ -822,9 +793,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, #define sched_class_highest (&rt_sched_class) +static void __update_curr_load(struct rq *rq, struct load_stat *ls) +{ + if (rq->curr != rq->idle && ls->load.weight) { + ls->delta_exec += ls->delta_stat; + ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); + ls->delta_stat = 0; + } +} + +/* + * Update delta_exec, delta_fair fields for rq. + * + * delta_fair clock advances at a rate inversely proportional to + * total load (rq->ls.load.weight) on the runqueue, while + * delta_exec advances at the same rate as wall-clock (provided + * cpu is not idle). 
+ * + * delta_exec / delta_fair is a measure of the (smoothened) load on this + * runqueue over any given interval. This (smoothened) load is used + * during load balance. + * + * This function is called /before/ updating rq->ls.load + * and when switching tasks. + */ +static void update_curr_load(struct rq *rq) +{ + struct load_stat *ls = &rq->ls; + u64 start; + + start = ls->load_update_start; + ls->load_update_start = rq->clock; + ls->delta_stat += rq->clock - start; + /* + * Stagger updates to ls->delta_fair. Very frequent updates + * can be expensive. + */ + if (ls->delta_stat >= sysctl_sched_stat_granularity) + __update_curr_load(rq, ls); +} + +static inline void inc_load(struct rq *rq, const struct task_struct *p) +{ + update_curr_load(rq); + update_load_add(&rq->ls.load, p->se.load.weight); +} + +static inline void dec_load(struct rq *rq, const struct task_struct *p) +{ + update_curr_load(rq); + update_load_sub(&rq->ls.load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq) +{ + rq->nr_running++; + inc_load(rq, p); +} + +static void dec_nr_running(struct task_struct *p, struct rq *rq) +{ + rq->nr_running--; + dec_load(rq, p); +} + static void set_load_weight(struct task_struct *p) { - task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; p->se.wait_runtime = 0; if (task_has_rt_policy(p)) { @@ -846,18 +880,16 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -static void -enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup, now); + p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; } -static void -dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - p->sched_class->dequeue_task(rq, p, sleep, now); + p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } @@ -912,13 +944,11 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup, now); - inc_nr_running(p, rq, now); + enqueue_task(rq, p, wakeup); + inc_nr_running(p, rq); } /* @@ -926,13 +956,13 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) { - u64 now = rq_clock(rq); + update_rq_clock(rq); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - enqueue_task(rq, p, 0, now); - inc_nr_running(p, rq, now); + enqueue_task(rq, p, 0); + inc_nr_running(p, rq); } /* @@ -940,13 +970,11 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { - u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(rq, p, sleep, now); - dec_nr_running(p, rq, now); + dequeue_task(rq, p, sleep); + dec_nr_running(p, rq); } /** @@ -981,18 +1009,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) u64 clock_offset, fair_clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - - new_rq->cfs.fair_clock; - if (p->se.wait_start) - p->se.wait_start -= 
clock_offset; + fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; + if (p->se.wait_start_fair) p->se.wait_start_fair -= fair_clock_offset; + if (p->se.sleep_start_fair) + p->se.sleep_start_fair -= fair_clock_offset; + +#ifdef CONFIG_SCHEDSTATS + if (p->se.wait_start) + p->se.wait_start -= clock_offset; if (p->se.sleep_start) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; +#endif __set_task_cpu(p, new_cpu); } @@ -1511,6 +1542,7 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ + update_rq_clock(rq); activate_task(rq, p, 1); /* * Sync wakeups (i.e. those types of wakeups where the waker @@ -1553,17 +1585,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) static void __sched_fork(struct task_struct *p) { p->se.wait_start_fair = 0; - p->se.wait_start = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; p->se.wait_runtime = 0; + p->se.sleep_start_fair = 0; + +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; p->se.block_start = 0; p->se.sleep_max = 0; p->se.block_max = 0; @@ -1571,10 +1606,15 @@ static void __sched_fork(struct task_struct *p) p->se.wait_max = 0; p->se.wait_runtime_overruns = 0; p->se.wait_runtime_underruns = 0; +#endif INIT_LIST_HEAD(&p->run_list); p->se.on_rq = 0; +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1639,11 +1679,19 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); /* parent's CPU */ + update_rq_clock(rq); p->prio = effective_prio(p); - if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || - task_cpu(p) != this_cpu || !current->se.on_rq) { + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + + if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || + (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || + !current->se.on_rq) { + activate_task(rq, p, 0); } else { /* @@ -1651,14 +1699,74 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); + inc_nr_running(p, rq); } check_preempt_curr(rq, p); task_rq_unlock(rq, &flags); } +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is safe to call from within a preemption notifier. 
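Preempt notifiers give a subsystem per-task callbacks around every context switch of the registering task (KVM is the motivating user). The sched_in/sched_out signatures follow directly from the fire_* helpers above; preempt_notifier_init() and struct preempt_ops are assumed to come from <linux/preempt.h> alongside this change, and the foo_vcpu_* names are hypothetical:

#include <linux/preempt.h>
#include <linux/sched.h>

struct foo_vcpu {
        struct preempt_notifier pn;
        /* ... hypothetical per-task context ... */
};

static void foo_vcpu_load(struct foo_vcpu *v, int cpu);  /* hypothetical */
static void foo_vcpu_put(struct foo_vcpu *v);            /* hypothetical */

static void foo_sched_in(struct preempt_notifier *pn, int cpu)
{
        /* current was just rescheduled onto 'cpu': reload per-CPU state */
        foo_vcpu_load(container_of(pn, struct foo_vcpu, pn), cpu);
}

static void foo_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
        /* current is being switched out in favour of 'next': stash state */
        foo_vcpu_put(container_of(pn, struct foo_vcpu, pn));
}

static struct preempt_ops foo_preempt_ops = {
        .sched_in  = foo_sched_in,
        .sched_out = foo_sched_out,
};

static void foo_vcpu_attach(struct foo_vcpu *v)
{
        preempt_notifier_init(&v->pn, &foo_preempt_ops);  /* assumed helper */
        preempt_notifier_register(&v->pn);
}

Because the list hangs off current->preempt_notifiers, registration has to happen in the context of the task that wants the callbacks.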
+ */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out * @next: the task we are going to switch to. * * This is called with the rq lock held and interrupts off. It must @@ -1668,8 +1776,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { + fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -1711,6 +1822,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { @@ -1751,7 +1863,7 @@ context_switch(struct rq *rq, struct task_struct *prev, { struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, next); + prepare_task_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -1874,7 +1986,6 @@ static void update_cpu_load(struct rq *this_rq) unsigned long total_load = this_rq->ls.load.weight; unsigned long this_load = total_load; struct load_stat *ls = &this_rq->ls; - u64 now = __rq_clock(this_rq); int i, scale; this_rq->nr_load_updates++; @@ -1882,7 +1993,7 @@ static void update_cpu_load(struct rq *this_rq) goto do_avg; /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq, now); + update_curr_load(this_rq); fair_delta64 = ls->delta_fair + 1; ls->delta_fair = 0; @@ -1890,8 +2001,8 @@ static void update_cpu_load(struct rq *this_rq) exec_delta64 = ls->delta_exec + 1; ls->delta_exec = 0; - sample_interval64 = now - ls->load_update_last; - ls->load_update_last = now; + sample_interval64 = this_rq->clock - ls->load_update_last; + ls->load_update_last = this_rq->clock; if ((s64)sample_interval64 < (s64)TICK_NSEC) sample_interval64 = TICK_NSEC; @@ -1946,6 +2057,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) spin_lock(&rq1->lock); } } + update_rq_clock(rq1); + update_rq_clock(rq2); } /* @@ -2073,12 +2186,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; - /* - * Aggressive migration if too many balance attempts have failed: - */ - if (sd->nr_balance_failed > sd->cache_nice_tries) - return 1; - return 1; } @@ -2086,8 
+2193,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + int *this_best_prio, struct rq_iterator *iterator) { int pulled = 0, pinned = 0, skip_for_load; struct task_struct *p; @@ -2112,12 +2218,8 @@ next: */ skip_for_load = (p->se.load.weight >> 1) > rem_load_move + SCHED_LOAD_SCALE_FUZZ; - if (skip_for_load && p->prio < this_best_prio) - skip_for_load = !best_prio_seen && p->prio == best_prio; - if (skip_for_load || + if ((skip_for_load && p->prio >= *this_best_prio) || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - - best_prio_seen |= p->prio == best_prio; p = iterator->next(iterator->arg); goto next; } @@ -2131,8 +2233,8 @@ next: * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (p->prio < this_best_prio) - this_best_prio = p->prio; + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; p = iterator->next(iterator->arg); goto next; } @@ -2151,32 +2253,52 @@ out: } /* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. * * Called with both runqueues locked. */ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, + unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { struct sched_class *class = sched_class_highest; - unsigned long load_moved, total_nr_moved = 0, nr_moved; - long rem_load_move = max_load_move; + unsigned long total_load_moved = 0; + int this_best_prio = this_rq->curr->prio; do { - nr_moved = class->load_balance(this_rq, this_cpu, busiest, - max_nr_move, (unsigned long)rem_load_move, - sd, idle, all_pinned, &load_moved); - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; + total_load_moved += + class->load_balance(this_rq, this_cpu, busiest, + ULONG_MAX, max_load_move - total_load_moved, + sd, idle, all_pinned, &this_best_prio); class = class->next; - } while (class && max_nr_move && rem_load_move > 0); + } while (class && max_load_move > total_load_moved); + + return total_load_moved > 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. 
+ */ +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct sched_class *class; + int this_best_prio = MAX_PRIO; + + for (class = sched_class_highest; class; class = class->next) + if (class->load_balance(this_rq, this_cpu, busiest, + 1, ULONG_MAX, sd, idle, NULL, + &this_best_prio)) + return 1; - return total_nr_moved; + return 0; } /* @@ -2235,7 +2357,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, rq = cpu_rq(i); - if (*sd_idle && !idle_cpu(i)) + if (*sd_idle && rq->nr_running) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ @@ -2257,9 +2379,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above - * domains. + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. */ - if (local_group && balance_cpu != this_cpu && balance) { + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { *balance = 0; goto ret; } @@ -2393,7 +2517,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2445,10 +2569,8 @@ small_imbalance: pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ - if (pwr_move <= pwr_now) - goto out_balanced; - - *imbalance = busiest_load_per_task; + if (pwr_move > pwr_now) + *imbalance = busiest_load_per_task; } return busiest; @@ -2506,11 +2628,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 -static inline unsigned long minus_1_or_zero(unsigned long n) -{ - return n > 0 ? n - 1 : 0; -} - /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2519,7 +2636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -2560,18 +2677,17 @@ redo: schedstat_add(sd, lb_imbalance[idle], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is + * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ local_irq_save(flags); double_rq_lock(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), + ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -2579,7 +2695,7 @@ redo: /* * some other cpu did the load balance for us. 
*/ - if (nr_moved && this_cpu != smp_processor_id()) + if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); /* All tasks on this runqueue were pinned by CPU affinity */ @@ -2591,7 +2707,7 @@ redo: } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2640,10 +2756,10 @@ redo: sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -2675,8 +2791,9 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) struct sched_group *group; struct rq *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int ld_moved = 0; int sd_idle = 0; + int all_pinned = 0; cpumask_t cpus = CPU_MASK_ALL; /* @@ -2709,23 +2826,25 @@ redo: schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), - imbalance, sd, CPU_NEWLY_IDLE, NULL); + /* this_rq->clock is already updated */ + update_rq_clock(busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); spin_unlock(&busiest->lock); - if (!nr_moved) { + if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) @@ -2733,7 +2852,7 @@ redo: } else sd->nr_balance_failed = 0; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); @@ -2810,6 +2929,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); + update_rq_clock(busiest_rq); + update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { @@ -2821,9 +2942,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) if (likely(sd)) { schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, - NULL)) + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -2921,6 +3041,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2957,8 +3078,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) if (sd->flags & SD_SERIALIZE) spin_unlock(&balancing); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. 
There is another @@ -2968,7 +3091,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* @@ -3007,7 +3137,7 @@ static void run_rebalance_domains(struct softirq_action *h) if (need_resched()) break; - rebalance_domains(balance_cpu, SCHED_IDLE); + rebalance_domains(balance_cpu, CPU_IDLE); rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) @@ -3092,8 +3222,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + int *this_best_prio, struct rq_iterator *iterator) { *load_moved = 0; @@ -3119,7 +3248,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime; if (rq->curr == p) { - delta_exec = rq_clock(rq) - p->se.exec_start; + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) ns += delta_exec; } @@ -3213,11 +3343,19 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + u64 next_tick = rq->tick_timestamp + TICK_NSEC; spin_lock(&rq->lock); + __update_rq_clock(rq); + /* + * Let rq->clock advance by at least TICK_NSEC: + */ + if (unlikely(rq->clock < next_tick)) + rq->clock = next_tick; + rq->tick_timestamp = rq->clock; + update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? 
*/ curr->sched_class->task_tick(rq, curr); - update_cpu_load(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3299,7 +3437,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) +pick_next_task(struct rq *rq, struct task_struct *prev) { struct sched_class *class; struct task_struct *p; @@ -3309,14 +3447,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) * the fair class we can call that function directly: */ if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq, now); + p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } class = sched_class_highest; for ( ; ; ) { - p = class->pick_next_task(rq, now); + p = class->pick_next_task(rq); if (p) return p; /* @@ -3335,7 +3473,6 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; long *switch_count; struct rq *rq; - u64 now; int cpu; need_resched: @@ -3353,6 +3490,7 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + __update_rq_clock(rq); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3367,9 +3505,8 @@ need_resched_nonpreemptible: if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - now = __rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev, now); - next = pick_next_task(rq, prev, now); + prev->sched_class->put_prev_task(rq, prev); + next = pick_next_task(rq, prev); sched_info_switch(prev, next); @@ -3812,17 +3949,16 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq; struct rq *rq; - u64 now; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - now = rq_clock(rq); + update_rq_clock(rq); oldprio = p->prio; on_rq = p->se.on_rq; if (on_rq) - dequeue_task(rq, p, 0, now); + dequeue_task(rq, p, 0); if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3832,7 +3968,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { - enqueue_task(rq, p, 0, now); + enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3855,7 +3991,6 @@ void set_user_nice(struct task_struct *p, long nice) int old_prio, delta, on_rq; unsigned long flags; struct rq *rq; - u64 now; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3864,7 +3999,7 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. 
*/ rq = task_rq_lock(p, &flags); - now = rq_clock(rq); + update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3877,8 +4012,8 @@ void set_user_nice(struct task_struct *p, long nice) } on_rq = p->se.on_rq; if (on_rq) { - dequeue_task(rq, p, 0, now); - dec_load(rq, p, now); + dequeue_task(rq, p, 0); + dec_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); @@ -3888,8 +4023,8 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0, now); - inc_load(rq, p, now); + enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4125,6 +4260,7 @@ recheck: spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq, p, 0); @@ -4380,10 +4516,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); mutex_unlock(&sched_hotcpu_mutex); - if (retval) - return retval; - return 0; + return retval; } /** @@ -4422,10 +4556,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) - schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's @@ -4781,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; + + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_runtime_limit = sysctl_sched_latency; + sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; } #ifdef CONFIG_SMP @@ -4883,6 +5018,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq_src, p, 0); + set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); @@ -5115,14 +5251,137 @@ static void migrate_dead_tasks(unsigned int dead_cpu) for ( ; ; ) { if (!rq->nr_running) break; - next = pick_next_task(rq, rq->curr, rq_clock(rq)); + update_rq_clock(rq); + next = pick_next_task(rq, rq->curr); if (!next) break; migrate_dead(dead_cpu, next); + } } #endif /* CONFIG_HOTPLUG_CPU */ +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {0,}, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + + return entry; +} + +static void +set_table_entry(struct 
ctl_table *entry, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[12], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void init_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} +#else +static void init_sched_domain_sysctl(void) +{ +} +#endif + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -5179,6 +5438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); + update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); @@ -6101,7 +6361,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static int arch_reinit_sched_domains(void) { int err; @@ -6130,24 +6390,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) return ret ? 
ret : count; } -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); -#endif - return err; -} -#endif - #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) { @@ -6158,8 +6400,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 0); } -SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT @@ -6172,8 +6414,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 1); } -SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, - sched_smt_power_savings_store); +static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} #endif /* @@ -6228,6 +6488,8 @@ void __init sched_init_smp(void) /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_sched_domain_sysctl(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); @@ -6314,6 +6576,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); @@ -6379,12 +6645,14 @@ void normalize_rt_tasks(void) do_each_thread(g, p) { p->se.fair_key = 0; p->se.wait_runtime = 0; + p->se.exec_start = 0; p->se.wait_start_fair = 0; + p->se.sleep_start_fair = 0; +#ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.exec_start = 0; p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; p->se.block_start = 0; +#endif task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; @@ -6408,12 +6676,13 @@ void normalize_rt_tasks(void) goto out_unlock; #endif + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) - deactivate_task(task_rq(p), p, 0); + deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(task_rq(p), p, 0); + activate_task(rq, p, 0); resched_task(rq->curr); } #ifdef CONFIG_SMP diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 29f2c21e7da..c3ee38bd342 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -29,29 +29,34 @@ } while (0) static void -print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now) +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { if (rq->curr == p) SEQ_printf(m, "R"); else SEQ_printf(m, " "); - SEQ_printf(m, "%15s %5d %15Ld %13Ld %13Ld %9Ld %5d " - "%15Ld %15Ld %15Ld %15Ld %15Ld\n", + SEQ_printf(m, "%15s %5d 
%15Ld %13Ld %13Ld %9Ld %5d ", p->comm, p->pid, (long long)p->se.fair_key, (long long)(p->se.fair_key - rq->cfs.fair_clock), (long long)p->se.wait_runtime, (long long)(p->nvcsw + p->nivcsw), - p->prio, + p->prio); +#ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", (long long)p->se.sum_exec_runtime, (long long)p->se.sum_wait_runtime, (long long)p->se.sum_sleep_runtime, (long long)p->se.wait_runtime_overruns, (long long)p->se.wait_runtime_underruns); +#else + SEQ_printf(m, "%15Ld %15Ld %15Ld %15Ld %15Ld\n", + 0LL, 0LL, 0LL, 0LL, 0LL); +#endif } -static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) +static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; @@ -72,7 +77,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now) if (!p->se.on_rq || task_cpu(p) != rq_cpu) continue; - print_task(m, rq, p, now); + print_task(m, rq, p); } while_each_thread(g, p); read_unlock_irq(&tasklist_lock); @@ -101,9 +106,9 @@ print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) (long long)wait_runtime_rq_sum); } -void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq); + SEQ_printf(m, "\ncfs_rq\n"); #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(cfs_rq->x)) @@ -119,7 +124,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) print_cfs_rq_runtime_sum(m, cpu, cfs_rq); } -static void print_cpu(struct seq_file *m, int cpu, u64 now) +static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = &per_cpu(runqueues, cpu); @@ -149,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(next_balance); P(curr->pid); P(clock); + P(idle_clock); P(prev_clock_raw); P(clock_warps); P(clock_overflows); - P(clock_unstable_events); + P(clock_deep_idle_events); P(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); @@ -161,9 +167,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(cpu_load[4]); #undef P - print_cfs_stats(m, cpu, now); + print_cfs_stats(m, cpu); - print_rq(m, rq, cpu, now); + print_rq(m, rq, cpu); } static int sched_debug_show(struct seq_file *m, void *v) @@ -171,7 +177,7 @@ static int sched_debug_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -179,14 +185,14 @@ static int sched_debug_show(struct seq_file *m, void *v) SEQ_printf(m, "now at %Lu nsecs\n", (unsigned long long)now); for_each_online_cpu(cpu) - print_cpu(m, cpu, now); + print_cpu(m, cpu); SEQ_printf(m, "\n"); return 0; } -void sysrq_sched_debug_show(void) +static void sysrq_sched_debug_show(void) { sched_debug_show(NULL, NULL); } @@ -200,7 +206,7 @@ static struct file_operations sched_debug_fops = { .open = sched_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; static int __init init_sched_debug_procfs(void) @@ -235,21 +241,24 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) - P(se.wait_start); + P(se.wait_runtime); P(se.wait_start_fair); P(se.exec_start); - P(se.sleep_start); P(se.sleep_start_fair); + 
P(se.sum_exec_runtime); + +#ifdef CONFIG_SCHEDSTATS + P(se.wait_start); + P(se.sleep_start); P(se.block_start); P(se.sleep_max); P(se.block_max); P(se.exec_max); P(se.wait_max); - P(se.wait_runtime); P(se.wait_runtime_overruns); P(se.wait_runtime_underruns); P(se.sum_wait_runtime); - P(se.sum_exec_runtime); +#endif SEQ_printf(m, "%-25s:%20Ld\n", "nr_switches", (long long)(p->nvcsw + p->nivcsw)); P(se.load.weight); @@ -269,7 +278,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) void proc_sched_set_task(struct task_struct *p) { +#ifdef CONFIG_SCHEDSTATS p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0; p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0; +#endif p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6971db0a716..67c67a87146 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -15,34 +15,50 @@ * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> */ /* - * Preemption granularity: - * (default: 2 msec, units: nanoseconds) + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms, units: nanoseconds) * - * NOTE: this granularity value is not the same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length. + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches field) * * On SMP systems the value of this is multiplied by the log2 of the * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) + * Targeted preemption latency for CPU-bound tasks: + */ +unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; +unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; + +/* + * sys_sched_yield() compat mode + * + * This option switches the agressive yield implementation of the + * old scheduler back on. + */ +unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 10 msec, units: nanoseconds) + * (default: 25 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = - 10000000000ULL/HZ; +unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL; /* * SCHED_OTHER wake-up granularity. @@ -52,12 +68,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
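Editorial note: the log2 scaling described in the comment above is applied at boot by sched_init_granularity() (see the kernel/sched.c hunk earlier in this diff): factor = 1 + ilog2(num_online_cpus()), with each scaled value clamped to a 100 ms limit. A standalone sketch of that arithmetic; ilog2_u() is a local helper here, not the kernel's ilog2():

#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
        unsigned int r = 0;

        while (n >>= 1)
                r++;
        return r;
}

int main(void)
{
        const unsigned long limit = 100000000UL;        /* 100 ms clamp */
        const unsigned long latency = 20000000UL;       /* default sched_latency */
        const unsigned long min_gran = 2000000UL;       /* default min granularity */
        unsigned int cpus[] = { 1, 2, 4, 8, 16 };

        for (int i = 0; i < 5; i++) {
                unsigned int factor = 1 + ilog2_u(cpus[i]);
                unsigned long lat = latency * factor;
                unsigned long gran = min_gran * factor;

                if (lat > limit)
                        lat = limit;
                if (gran > limit)
                        gran = limit;
                printf("%2u CPUs: factor %u -> latency %lu ns, min granularity %lu ns\n",
                       cpus[i], factor, lat, gran);
        }
        return 0;
}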
*/ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; +unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL; unsigned int sysctl_sched_stat_granularity __read_mostly; /* - * Initialized in sched_init_granularity(): + * Initialized in sched_init_granularity() [to 5 times the base granularity]: */ unsigned int sysctl_sched_runtime_limit __read_mostly; @@ -75,7 +91,7 @@ enum { unsigned int sysctl_sched_features __read_mostly = SCHED_FEAT_FAIR_SLEEPERS *1 | - SCHED_FEAT_SLEEPER_AVG *1 | + SCHED_FEAT_SLEEPER_AVG *0 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | SCHED_FEAT_START_DEBIT *1 | @@ -186,6 +202,8 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + + schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } static inline void @@ -197,6 +215,8 @@ __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + + schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); } static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) @@ -214,6 +234,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) */ /* + * Calculate the preemption granularity needed to schedule every + * runnable task once per sysctl_sched_latency amount of time. + * (down to a sensible low limit on granularity) + * + * For example, if there are 2 tasks running and latency is 10 msecs, + * we switch tasks every 5 msecs. If we have 3 tasks running, we have + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency + * for each task. We do finer and finer scheduling up to until we + * reach the minimum granularity value. + * + * To achieve this we use the following dynamic-granularity rule: + * + * gran = lat/nr - lat/nr/nr + * + * This comes out of the following equations: + * + * kA1 + gran = kB1 + * kB2 + gran = kA2 + * kA2 = kA1 + * kB2 = kB1 - d + d/nr + * lat = d * nr + * + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), + * '1' is start of time, '2' is end of time, 'd' is delay between + * 1 and 2 (during which task B was running), 'nr' is number of tasks + * running, 'lat' is the the period of each task. ('lat' is the + * sched_latency that we aim for.) 
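Editorial note, a worked example of the rule just derived (sched_granularity() in the hunk that follows implements the same arithmetic on cfs_rq->nr_running): with the default 20 ms latency, two runnable tasks give 20/2 - 20/4 = 5 ms, three give roughly 4.4 ms, and the result is never allowed below the 2 ms minimum granularity. A standalone version that can be compiled and run:

#include <stdio.h>

static const unsigned long sysctl_sched_latency = 20000000UL;          /* 20 ms */
static const unsigned long sysctl_sched_min_granularity = 2000000UL;   /*  2 ms */

/* Same shape as the patch's sched_granularity(), parameterized on nr_running. */
static unsigned long granularity(unsigned int nr)
{
        unsigned long gran = sysctl_sched_latency;

        if (nr > 1) {
                gran = gran / nr - gran / nr / nr;
                if (gran < sysctl_sched_min_granularity)
                        gran = sysctl_sched_min_granularity;
        }
        return gran;
}

int main(void)
{
        for (unsigned int nr = 1; nr <= 16; nr *= 2)
                printf("nr_running=%2u -> granularity %lu ns\n",
                       nr, granularity(nr));
        return 0;
}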
+ */ +static long +sched_granularity(struct cfs_rq *cfs_rq) +{ + unsigned int gran = sysctl_sched_latency; + unsigned int nr = cfs_rq->nr_running; + + if (nr > 1) { + gran = gran/nr - gran/nr/nr; + gran = max(gran, sysctl_sched_min_granularity); + } + + return gran; +} + +/* * We rescale the rescheduling granularity of tasks according to their * nice level, but only linearly, not exponentially: */ @@ -222,21 +285,25 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity) { u64 tmp; + if (likely(curr->load.weight == NICE_0_LOAD)) + return granularity; /* - * Negative nice levels get the same granularity as nice-0: + * Positive nice levels get the same granularity as nice-0: */ - if (likely(curr->load.weight >= NICE_0_LOAD)) - return granularity; + if (likely(curr->load.weight < NICE_0_LOAD)) { + tmp = curr->load.weight * (u64)granularity; + return (long) (tmp >> NICE_0_SHIFT); + } /* - * Positive nice level tasks get linearly finer + * Negative nice level tasks get linearly finer * granularity: */ - tmp = curr->load.weight * (u64)granularity; + tmp = curr->load.inv_weight * (u64)granularity; /* * It will always fit into 'long': */ - return (long) (tmp >> NICE_0_SHIFT); + return (long) (tmp >> (WMULT_SHIFT-NICE_0_SHIFT)); } static inline void @@ -281,34 +348,28 @@ add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) * are not in our scheduling class. */ static inline void -__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - unsigned long delta, delta_exec, delta_fair; - long delta_mine; + unsigned long delta, delta_exec, delta_fair, delta_mine; struct load_weight *lw = &cfs_rq->load; unsigned long load = lw->weight; - if (unlikely(!load)) - return; - delta_exec = curr->delta_exec; -#ifdef CONFIG_SCHEDSTATS - if (unlikely(delta_exec > curr->exec_max)) - curr->exec_max = delta_exec; -#endif + schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; cfs_rq->exec_clock += delta_exec; + if (unlikely(!load)) + return; + delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) { - delta = calc_delta_mine(cfs_rq->sleeper_bonus, - curr->load.weight, lw); - if (unlikely(delta > cfs_rq->sleeper_bonus)) - delta = cfs_rq->sleeper_bonus; - + if (cfs_rq->sleeper_bonus > sysctl_sched_min_granularity) { + delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); + delta = min(delta, (unsigned long)( + (long)sysctl_sched_runtime_limit - curr->wait_runtime)); cfs_rq->sleeper_bonus -= delta; delta_mine -= delta; } @@ -324,7 +385,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); } -static void update_curr(struct cfs_rq *cfs_rq, u64 now) +static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq_curr(cfs_rq); unsigned long delta_exec; @@ -337,22 +398,22 @@ static void update_curr(struct cfs_rq *cfs_rq, u64 now) * since the last time we changed load (this cannot * overflow on 32 bits): */ - delta_exec = (unsigned long)(now - curr->exec_start); + delta_exec = (unsigned long)(rq_of(cfs_rq)->clock - curr->exec_start); curr->delta_exec += delta_exec; if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { - __update_curr(cfs_rq, curr, now); + __update_curr(cfs_rq, curr); curr->delta_exec = 0; } - 
curr->exec_start = now; + curr->exec_start = rq_of(cfs_rq)->clock; } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { se->wait_start_fair = cfs_rq->fair_clock; - se->wait_start = now; + schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); } /* @@ -380,8 +441,7 @@ calc_weighted(unsigned long delta, unsigned long weight, int shift) /* * Task is being enqueued - update stats: */ -static void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { s64 key; @@ -390,7 +450,7 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * a dequeue/enqueue event is a NOP) */ if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_start(cfs_rq, se, now); + update_stats_wait_start(cfs_rq, se); /* * Update the key: */ @@ -410,7 +470,8 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) (WMULT_SHIFT - NICE_0_SHIFT); } else { tmp = se->wait_runtime; - key -= (tmp * se->load.weight) >> NICE_0_SHIFT; + key -= (tmp * se->load.inv_weight) >> + (WMULT_SHIFT - NICE_0_SHIFT); } } @@ -421,17 +482,12 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * Note: must be called with a freshly updated rq->fair_clock. */ static inline void -__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +__update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long delta_fair = se->delta_fair_run; -#ifdef CONFIG_SCHEDSTATS - { - s64 delta_wait = now - se->wait_start; - if (unlikely(delta_wait > se->wait_max)) - se->wait_max = delta_wait; - } -#endif + schedstat_set(se->wait_max, max(se->wait_max, + rq_of(cfs_rq)->clock - se->wait_start)); if (unlikely(se->load.weight != NICE_0_LOAD)) delta_fair = calc_weighted(delta_fair, se->load.weight, @@ -441,53 +497,56 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) } static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long delta_fair; + if (unlikely(!se->wait_start_fair)) + return; + delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), (u64)(cfs_rq->fair_clock - se->wait_start_fair)); se->delta_fair_run += delta_fair; if (unlikely(abs(se->delta_fair_run) >= sysctl_sched_stat_granularity)) { - __update_stats_wait_end(cfs_rq, se, now); + __update_stats_wait_end(cfs_rq, se); se->delta_fair_run = 0; } se->wait_start_fair = 0; - se->wait_start = 0; + schedstat_set(se->wait_start, 0); } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_curr(cfs_rq, now); + update_curr(cfs_rq); /* * Mark the end of the wait period if dequeueing a * waiting task: */ if (se != cfs_rq_curr(cfs_rq)) - update_stats_wait_end(cfs_rq, se, now); + update_stats_wait_end(cfs_rq, se); } /* * We are picking a new current task - update its stats: */ static inline void -update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * We are starting a new run period: */ - se->exec_start = now; + se->exec_start = rq_of(cfs_rq)->clock; } /* * We are descheduling a task - update its 
stats: */ static inline void -update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { se->exec_start = 0; } @@ -496,12 +555,18 @@ update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * Scheduling class queueing methods: */ -static void -__enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned long load = cfs_rq->load.weight, delta_fair; long prev_runtime; + /* + * Do not boost sleepers if there's too much bonus 'in flight' + * already: + */ + if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit)) + return; + if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) load = rq_of(cfs_rq)->cpu_load[2]; @@ -527,12 +592,9 @@ __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * Track the amount of bonus we've given to sleepers: */ cfs_rq->sleeper_bonus += delta_fair; - - schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); } -static void -enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *tsk = task_of(se); unsigned long delta_fair; @@ -547,7 +609,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) se->delta_fair_sleep += delta_fair; if (unlikely(abs(se->delta_fair_sleep) >= sysctl_sched_stat_granularity)) { - __enqueue_sleeper(cfs_rq, se, now); + __enqueue_sleeper(cfs_rq, se); se->delta_fair_sleep = 0; } @@ -555,7 +617,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { - u64 delta = now - se->sleep_start; + u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; if ((s64)delta < 0) delta = 0; @@ -567,7 +629,7 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) se->sum_sleep_runtime += delta; } if (se->block_start) { - u64 delta = now - se->block_start; + u64 delta = rq_of(cfs_rq)->clock - se->block_start; if ((s64)delta < 0) delta = 0; @@ -577,31 +639,39 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) se->block_start = 0; se->sum_sleep_runtime += delta; + + /* + * Blocking time is in units of nanosecs, so shift by 20 to + * get a milliseconds-range estimation of the amount of + * time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), + delta >> 20); + } } #endif } static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int wakeup, u64 now) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) { /* * Update the fair clock. 
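Editorial note: the sleep-profiling hunk a few lines up converts a nanosecond delta to a milliseconds-range count with a cheap shift, as its comment says: delta >> 20 divides by 2^20 = 1048576, about 5% away from a true division by 10^6, which is plenty for a profiling histogram. A quick standalone check of the approximation:

#include <stdio.h>

int main(void)
{
        unsigned long long samples[] = { 1000000ULL, 10000000ULL, 123456789ULL };

        for (int i = 0; i < 3; i++) {
                unsigned long long ns = samples[i];

                /* shift approximation vs. exact nanoseconds-to-milliseconds */
                printf("%12llu ns: >>20 = %4llu   /1000000 = %4llu\n",
                       ns, ns >> 20, ns / 1000000ULL);
        }
        return 0;
}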
*/ - update_curr(cfs_rq, now); + update_curr(cfs_rq); if (wakeup) - enqueue_sleeper(cfs_rq, se, now); + enqueue_sleeper(cfs_rq, se); - update_stats_enqueue(cfs_rq, se, now); + update_stats_enqueue(cfs_rq, se); __enqueue_entity(cfs_rq, se); } static void -dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - int sleep, u64 now) +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { - update_stats_dequeue(cfs_rq, se, now); + update_stats_dequeue(cfs_rq, se); if (sleep) { se->sleep_start_fair = cfs_rq->fair_clock; #ifdef CONFIG_SCHEDSTATS @@ -609,11 +679,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - se->sleep_start = now; + se->sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) - se->block_start = now; + se->block_start = rq_of(cfs_rq)->clock; } - cfs_rq->wait_runtime -= se->wait_runtime; #endif } __dequeue_entity(cfs_rq, se); @@ -627,18 +696,38 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { s64 __delta = curr->fair_key - se->fair_key; + unsigned long ideal_runtime, delta_exec; + + /* + * ideal_runtime is compared against sum_exec_runtime, which is + * walltime, hence do not scale. + */ + ideal_runtime = max(sysctl_sched_latency / cfs_rq->nr_running, + (unsigned long)sysctl_sched_min_granularity); + + /* + * If we executed more than what the latency constraint suggests, + * reduce the rescheduling granularity. This way the total latency + * of how much a task is not scheduled converges to + * sysctl_sched_latency: + */ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + granularity = 0; /* * Take scheduling granularity into account - do not * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: + * + * scale granularity as key space is in fair_clock. 
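Editorial note, a standalone sketch of the wall-time check added above: the running entity is granted ideal_runtime = max(latency / nr_running, min_granularity), and once the time it has run since being picked (sum_exec_runtime minus the prev_sum_exec_runtime snapshot taken in set_next_entity(), just below) exceeds that, the granularity requirement is dropped so the next entity preempts at once. The values are this patch's defaults; used_up_slice() is a stand-in name:

#include <stdio.h>

static const unsigned long sysctl_sched_latency = 20000000UL;          /* 20 ms */
static const unsigned long sysctl_sched_min_granularity = 2000000UL;   /*  2 ms */

/* Returns 1 when the running entity has exhausted its ideal slice. */
static int used_up_slice(unsigned long long sum_exec,
                         unsigned long long prev_sum_exec,
                         unsigned int nr_running)
{
        unsigned long ideal = sysctl_sched_latency / nr_running;

        if (ideal < sysctl_sched_min_granularity)
                ideal = sysctl_sched_min_granularity;

        return (sum_exec - prev_sum_exec) > ideal;
}

int main(void)
{
        /* 4 runnable tasks -> 5 ms slice; 6 ms of runtime exceeds it. */
        printf("ran 6 ms of a 5 ms slice: preempt=%d\n",
               used_up_slice(6000000ULL, 0ULL, 4));
        /* 3 ms of runtime does not. */
        printf("ran 3 ms of a 5 ms slice: preempt=%d\n",
               used_up_slice(3000000ULL, 0ULL, 4));
        return 0;
}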
*/ if (__delta > niced_granularity(curr, granularity)) resched_task(rq_of(cfs_rq)->curr); } static inline void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Any task has to be enqueued before it get to execute on @@ -647,49 +736,47 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) * done a put_prev_task_fair() shortly before this, which * updated rq->fair_clock - used by update_stats_wait_end()) */ - update_stats_wait_end(cfs_rq, se, now); - update_stats_curr_start(cfs_rq, se, now); + update_stats_wait_end(cfs_rq, se); + update_stats_curr_start(cfs_rq, se); set_cfs_rq_curr(cfs_rq, se); + se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); - set_next_entity(cfs_rq, se, now); + set_next_entity(cfs_rq, se); return se; } -static void -put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: */ if (prev->on_rq) - update_curr(cfs_rq, now); + update_curr(cfs_rq); - update_stats_curr_end(cfs_rq, prev, now); + update_stats_curr_end(cfs_rq, prev); if (prev->on_rq) - update_stats_wait_start(cfs_rq, prev, now); + update_stats_wait_start(cfs_rq, prev); set_cfs_rq_curr(cfs_rq, NULL); } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct rq *rq = rq_of(cfs_rq); struct sched_entity *next; - u64 now = __rq_clock(rq); /* * Dequeue and enqueue the task to update its * position within the tree: */ - dequeue_entity(cfs_rq, curr, 0, now); - enqueue_entity(cfs_rq, curr, 0, now); + dequeue_entity(cfs_rq, curr, 0); + enqueue_entity(cfs_rq, curr, 0); /* * Reschedule if another task tops the current one. @@ -698,7 +785,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** @@ -794,8 +882,7 @@ static inline int is_same_group(struct task_struct *curr, struct task_struct *p) * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ -static void -enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; @@ -804,7 +891,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup, now); + enqueue_entity(cfs_rq, se, wakeup); } } @@ -813,15 +900,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) * decreased. 
We remove the task from the rbtree and * update the fair scheduling stats: */ -static void -dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - dequeue_entity(cfs_rq, se, sleep, now); + dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; @@ -829,19 +915,62 @@ dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) } /* - * sched_yield() support is very simple - we dequeue and enqueue + * sched_yield() support is very simple - we dequeue and enqueue. + * + * If compat_yield is turned on then we requeue to the end of the tree. */ static void yield_task_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - u64 now = __rq_clock(rq); + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct sched_entity *rightmost, *se = &p->se; + struct rb_node *parent; /* - * Dequeue and enqueue the task to update its - * position within the tree: + * Are we the only task in the tree? + */ + if (unlikely(cfs_rq->nr_running == 1)) + return; + + if (likely(!sysctl_sched_compat_yield)) { + __update_rq_clock(rq); + /* + * Dequeue and enqueue the task to update its + * position within the tree: + */ + dequeue_entity(cfs_rq, &p->se, 0); + enqueue_entity(cfs_rq, &p->se, 0); + + return; + } + /* + * Find the rightmost entry in the rbtree: + */ + do { + parent = *link; + link = &parent->rb_right; + } while (*link); + + rightmost = rb_entry(parent, struct sched_entity, run_node); + /* + * Already in the rightmost position? */ - dequeue_entity(cfs_rq, &p->se, 0, now); - enqueue_entity(cfs_rq, &p->se, 0, now); + if (unlikely(rightmost == se)) + return; + + /* + * Minimally necessary key value to be last in the tree: + */ + se->fair_key = rightmost->fair_key + 1; + + if (cfs_rq->rb_leftmost == &se->run_node) + cfs_rq->rb_leftmost = rb_next(&se->run_node); + /* + * Relink the task to the rightmost position: + */ + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); } /* @@ -854,7 +983,8 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) unsigned long gran; if (unlikely(rt_prio(p->prio))) { - update_curr(cfs_rq, rq_clock(rq)); + update_rq_clock(rq); + update_curr(cfs_rq); resched_task(curr); return; } @@ -870,7 +1000,7 @@ static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); } -static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) +static struct task_struct *pick_next_task_fair(struct rq *rq) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -879,7 +1009,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) return NULL; do { - se = pick_next_entity(cfs_rq, now); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -889,14 +1019,14 @@ static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) { struct sched_entity *se = &prev->se; struct cfs_rq *cfs_rq; for_each_sched_entity(se) { cfs_rq 
= cfs_rq_of(se); - put_prev_entity(cfs_rq, se, now); + put_prev_entity(cfs_rq, se); } } @@ -939,6 +1069,7 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } +#ifdef CONFIG_FAIR_GROUP_SCHED static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) { struct sched_entity *curr; @@ -952,12 +1083,13 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) return p->prio; } +#endif -static int +static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) { struct cfs_rq *busy_cfs_rq; unsigned long load_moved, total_nr_moved = 0, nr_moved; @@ -968,15 +1100,14 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, cfs_rq_iterator.next = load_balance_next_fair; for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { +#ifdef CONFIG_FAIR_GROUP_SCHED struct cfs_rq *this_cfs_rq; long imbalance; unsigned long maxload; - int this_best_prio, best_prio, best_prio_seen = 0; this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - imbalance = busy_cfs_rq->load.weight - - this_cfs_rq->load.weight; + imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ if (imbalance <= 0) continue; @@ -985,27 +1116,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, imbalance /= 2; maxload = min(rem_load_move, imbalance); - this_best_prio = cfs_rq_best_prio(this_cfs_rq); - best_prio = cfs_rq_best_prio(busy_cfs_rq); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. - */ - if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) - best_prio_seen = 1; - + *this_best_prio = cfs_rq_best_prio(this_cfs_rq); +#else +# define maxload rem_load_move +#endif /* pass busy_cfs_rq argument into * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, maxload, sd, idle, all_pinned, - &load_moved, this_best_prio, best_prio, - best_prio_seen, &cfs_rq_iterator); + &load_moved, this_best_prio, &cfs_rq_iterator); total_nr_moved += nr_moved; max_nr_move -= nr_moved; @@ -1015,9 +1136,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, break; } - *total_load_moved = max_load_move - rem_load_move; - - return total_nr_moved; + return max_load_move - rem_load_move; } /* @@ -1044,35 +1163,34 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr) static void task_new_fair(struct rq *rq, struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); - struct sched_entity *se = &p->se; - u64 now = rq_clock(rq); + struct sched_entity *se = &p->se, *curr = cfs_rq_curr(cfs_rq); sched_info_queued(p); - update_stats_enqueue(cfs_rq, se, now); + update_curr(cfs_rq); + update_stats_enqueue(cfs_rq, se); /* * Child runs first: we let it run before the parent * until it reschedules once. 
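Editorial note: the key placement set up in the lines that follow is plain arithmetic: the child's fair_key is put one (nice-scaled) granularity plus 1 ns to the left of the parent's, so it sorts earlier in the rbtree and runs first, and with SCHED_FEAT_START_DEBIT it also starts half a granularity of wait_runtime in debt. A small numeric sketch for a nice-0 parent (all values illustrative):

#include <stdio.h>

int main(void)
{
        long long parent_key = 1000000000LL;    /* arbitrary fair_clock key */
        long gran = 5000000L;                   /* 5 ms: 20 ms latency, 4 tasks */

        long long child_key = parent_key - gran - 1;
        long child_debit = -(gran / 2);         /* SCHED_FEAT_START_DEBIT */

        printf("parent key %lld, child key %lld, child sorts first: %s\n",
               parent_key, child_key, child_key < parent_key ? "yes" : "no");
        printf("child starts with wait_runtime %ld ns\n", child_debit);
        return 0;
}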
We set up the key so that * it will preempt the parent: */ - p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + se->fair_key = curr->fair_key - + niced_granularity(curr, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: */ if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) - p->se.wait_start_fair = 0; + se->wait_start_fair = 0; /* * The statistical average of wait_runtime is about * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -(sysctl_sched_granularity / 2); + se->wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); - inc_nr_running(p, rq, now); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1083,15 +1201,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) */ static void set_curr_task_fair(struct rq *rq) { - struct task_struct *curr = rq->curr; - struct sched_entity *se = &curr->se; - u64 now = rq_clock(rq); - struct cfs_rq *cfs_rq; + struct sched_entity *se = &rq->curr->se; - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se, now); - } + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); } #else static void set_curr_task_fair(struct rq *rq) @@ -1120,12 +1233,11 @@ struct sched_class fair_sched_class __read_mostly = { }; #ifdef CONFIG_SCHED_DEBUG -void print_cfs_stats(struct seq_file *m, int cpu, u64 now) +static void print_cfs_stats(struct seq_file *m, int cpu) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(rq, cfs_rq) - print_cfs_rq(m, cpu, cfs_rq, now); + for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + print_cfs_rq(m, cpu, cfs_rq); } #endif diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 41841e741c4..3503fb2d9f9 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -13,7 +13,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) +static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); @@ -25,7 +25,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) * message if some code attempts to do it: */ static void -dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) +dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { spin_unlock_irq(&rq->lock); printk(KERN_ERR "bad: scheduling from the idle thread!\n"); @@ -33,15 +33,15 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) spin_lock_irq(&rq->lock); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { } -static int +static unsigned long load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *total_load_moved) + int *all_pinned, int *this_best_prio) { return 0; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 1192a2741b9..4b87476a02d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -7,7 +7,7 @@ * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
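Editorial note: update_curr_rt(), which follows, now reads its timestamp from rq->clock instead of a passed-in 'now', but the accounting pattern is unchanged: remember when the task started executing, charge the elapsed delta to sum_exec_runtime, and restart the window. A userspace analogue using a monotonic clock (illustrative only; the struct and names are stand-ins):

#include <stdio.h>
#include <time.h>

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

struct entity {
        unsigned long long exec_start;
        unsigned long long sum_exec_runtime;
};

/* Same shape as update_curr_rt(): charge the elapsed time, reset the window. */
static void update_curr(struct entity *e)
{
        unsigned long long clock = now_ns();
        unsigned long long delta = clock - e->exec_start;

        e->sum_exec_runtime += delta;
        e->exec_start = clock;
}

int main(void)
{
        struct entity e = { .exec_start = now_ns() };
        volatile unsigned long spin = 0;

        for (unsigned long i = 0; i < 50000000UL; i++)
                spin += i;              /* burn some CPU time */
        update_curr(&e);
        printf("charged %llu ns of runtime\n", e.sum_exec_runtime);
        return 0;
}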
*/ -static inline void update_curr_rt(struct rq *rq, u64 now) +static inline void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; u64 delta_exec; @@ -15,18 +15,17 @@ static inline void update_curr_rt(struct rq *rq, u64 now) if (!task_has_rt_policy(curr)) return; - delta_exec = now - curr->se.exec_start; + delta_exec = rq->clock - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - if (unlikely(delta_exec > curr->se.exec_max)) - curr->se.exec_max = delta_exec; + + schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; - curr->se.exec_start = now; + curr->se.exec_start = rq->clock; } -static void -enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) { struct rt_prio_array *array = &rq->rt.active; @@ -37,12 +36,11 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) /* * Adding/removing a task to/from a priority array: */ -static void -dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) { struct rt_prio_array *array = &rq->rt.active; - update_curr_rt(rq, now); + update_curr_rt(rq); list_del(&p->run_list); if (list_empty(array->queue + p->prio)) @@ -75,7 +73,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) +static struct task_struct *pick_next_task_rt(struct rq *rq) { struct rt_prio_array *array = &rq->rt.active; struct task_struct *next; @@ -89,14 +87,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); - next->se.exec_start = now; + next->se.exec_start = rq->clock; return next; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { - update_curr_rt(rq, now); + update_curr_rt(rq); p->se.exec_start = 0; } @@ -172,28 +170,15 @@ static struct task_struct *load_balance_next_rt(void *arg) return p; } -static int +static unsigned long load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned, unsigned long *load_moved) + int *all_pinned, int *this_best_prio) { - int this_best_prio, best_prio, best_prio_seen = 0; int nr_moved; struct rq_iterator rt_rq_iterator; - - best_prio = sched_find_first_bit(busiest->rt.active.bitmap); - this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); - - /* - * Enable handling of the case where there is more than one task - * with the best priority. If the current running task is one - * of those with prio==best_prio we know it won't be moved - * and therefore it's safe to override the skip (based on load) - * of any task we find with that prio. 
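Editorial note: task_tick_rt(), a little further below, only requeues and reschedules a round-robin task whose timeslice expired when it is not the sole entry at its priority, tested as p->run_list.prev != p->run_list.next: on a circular doubly linked list a node's prev and next are equal exactly when the list holds only the queue head and that one node. A minimal list to see why; this is a simplified stand-in for the kernel's struct list_head, not the real implementation:

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

static int has_siblings(struct list_head *n)
{
        return n->prev != n->next;      /* the check task_tick_rt() uses */
}

int main(void)
{
        struct list_head queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        printf("one task queued:  requeue? %d\n", has_siblings(&a));    /* 0 */
        list_add_tail(&b, &queue);
        printf("two tasks queued: requeue? %d\n", has_siblings(&a));    /* 1 */
        return 0;
}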
- */ - if (busiest->curr->prio == best_prio) - best_prio_seen = 1; + unsigned long load_moved; rt_rq_iterator.start = load_balance_start_rt; rt_rq_iterator.next = load_balance_next_rt; @@ -203,11 +188,10 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, rt_rq_iterator.arg = busiest; nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, - max_load_move, sd, idle, all_pinned, load_moved, - this_best_prio, best_prio, best_prio_seen, - &rt_rq_iterator); + max_load_move, sd, idle, all_pinned, &load_moved, + this_best_prio, &rt_rq_iterator); - return nr_moved; + return load_moved; } static void task_tick_rt(struct rq *rq, struct task_struct *p) @@ -223,19 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) return; p->time_slice = static_prio_timeslice(p->static_prio); - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - requeue_task_rt(rq, p); -} -/* - * No parent/child timeslice management necessary for RT tasks, - * just activate them: - */ -static void task_new_rt(struct rq *rq, struct task_struct *p) -{ - activate_task(rq, p, 1); + /* + * Requeue to the end of queue if we are not the only element + * on the queue: + */ + if (p->run_list.prev != p->run_list.next) { + requeue_task_rt(rq, p); + set_tsk_need_resched(p); + } } static struct sched_class rt_sched_class __read_mostly = { @@ -251,5 +231,4 @@ static struct sched_class rt_sched_class __read_mostly = { .load_balance = load_balance_rt, .task_tick = task_tick_rt, - .task_new = task_new_rt, }; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index c63c38f6fa6..c20a94dda61 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -116,6 +116,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) } # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) +# define schedstat_set(var, val) do { var = (val); } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -125,6 +126,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) +# define schedstat_set(var, val) do { } while (0) #endif #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) diff --git a/kernel/signal.c b/kernel/signal.c index 39d122753ba..79295238109 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -255,6 +255,16 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } +int unhandled_signal(struct task_struct *tsk, int sig) +{ + if (is_init(tsk)) + return 1; + if (tsk->ptrace & PT_PTRACED) + return 0; + return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || + (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); +} + /* Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be @@ -368,8 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - if (tsk == current) - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info); @@ -397,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } } } - if (likely(tsk 
== current)) - recalc_sigpending(); + recalc_sigpending(); if (signr && unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our @@ -415,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } - if ( signr && + if (signr && ((info->si_code & __SI_MASK) == __SI_TIMER) && info->si_sys_private){ /* @@ -523,18 +531,18 @@ static int check_kill_permission(int sig, struct siginfo *info, if (!valid_signal(sig)) return error; - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) - return error; - - error = -EPERM; - if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) - && ((sig != SIGCONT) || - (process_session(current) != process_session(t))) - && (current->euid ^ t->suid) && (current->euid ^ t->uid) - && (current->uid ^ t->suid) && (current->uid ^ t->uid) - && !capable(CAP_KILL)) + if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) + return error; + error = -EPERM; + if (((sig != SIGCONT) || + (process_session(current) != process_session(t))) + && (current->euid ^ t->suid) && (current->euid ^ t->uid) + && (current->uid ^ t->suid) && (current->uid ^ t->uid) + && !capable(CAP_KILL)) return error; + } return security_task_kill(t, info, sig, 0); } @@ -1290,20 +1298,19 @@ struct sigqueue *sigqueue_alloc(void) void sigqueue_free(struct sigqueue *q) { unsigned long flags; + spinlock_t *lock = ¤t->sighand->siglock; + BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); /* * If the signal is still pending remove it from the - * pending queue. + * pending queue. We must hold ->siglock while testing + * q->list to serialize with collect_signal(). */ - if (unlikely(!list_empty(&q->list))) { - spinlock_t *lock = ¤t->sighand->siglock; - read_lock(&tasklist_lock); - spin_lock_irqsave(lock, flags); - if (!list_empty(&q->list)) - list_del_init(&q->list); - spin_unlock_irqrestore(lock, flags); - read_unlock(&tasklist_lock); - } + spin_lock_irqsave(lock, flags); + if (!list_empty(&q->list)) + list_del_init(&q->list); + spin_unlock_irqrestore(lock, flags); + q->flags &= ~SIGQUEUE_PREALLOC; __sigqueue_free(q); } @@ -1551,10 +1558,6 @@ static inline int may_ptrace_stop(void) (current->ptrace & PT_ATTACHED))) return 0; - if (unlikely(current->signal == current->parent->signal) && - unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) - return 0; - /* * Are we in the middle of do_coredump? 
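Editorial note: the sigqueue_free() hunk above drops the tasklist_lock round trip and instead requires that the "is it still queued?" test and the removal happen inside one ->siglock critical section, so collect_signal() can never observe a half-removed entry. A hedged userspace sketch of that test-and-remove-under-one-lock shape, using a pthread mutex and a boolean in place of the list (illustrative only, not the kernel's locking):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;
static bool queued = true;      /* stands in for !list_empty(&q->list) */

/* Freeing side: test and remove in a single critical section. */
static void free_entry(void)
{
        pthread_mutex_lock(&siglock);
        if (queued)
                queued = false; /* stands in for list_del_init(&q->list) */
        pthread_mutex_unlock(&siglock);
        /* only now is it safe to actually free the entry */
}

/* Consuming side (collect_signal in the kernel) takes the same lock. */
static bool collect(void)
{
        bool got;

        pthread_mutex_lock(&siglock);
        got = queued;
        queued = false;
        pthread_mutex_unlock(&siglock);
        return got;
}

int main(void)
{
        free_entry();
        printf("consumer found entry: %d\n", collect());        /* 0 */
        return 0;
}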
* If so and our tracer is also part of the coredump stopping diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f546ddea43..bd89bc4eb0b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -271,8 +271,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); - #endif /* @@ -332,8 +330,6 @@ inline fastcall void raise_softirq_irqoff(unsigned int nr) wakeup_softirqd(); } -EXPORT_SYMBOL(raise_softirq_irqoff); - void fastcall raise_softirq(unsigned int nr) { unsigned long flags; diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 2c6c2bf8551..cd72424c266 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -72,7 +72,7 @@ void __lockfunc _read_lock(rwlock_t *lock) { preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock); @@ -88,8 +88,8 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_LOCKING - _raw_spin_lock(lock); +#ifdef CONFIG_LOCKDEP + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif @@ -102,7 +102,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock) local_irq_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_irq); @@ -111,7 +111,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock) local_bh_disable(); preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_bh); @@ -122,7 +122,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); return flags; } EXPORT_SYMBOL(_read_lock_irqsave); @@ -132,7 +132,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock) local_irq_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock_irq); @@ -141,7 +141,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock) local_bh_disable(); preempt_disable(); rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_); - _raw_read_lock(lock); + LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock); } EXPORT_SYMBOL(_read_lock_bh); @@ -152,7 +152,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) local_irq_save(flags); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); return flags; } EXPORT_SYMBOL(_write_lock_irqsave); @@ -162,7 +162,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock) local_irq_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock_irq); @@ -171,7 +171,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock) local_bh_disable(); preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock_bh); @@ -179,7 
+179,7 @@ void __lockfunc _spin_lock(spinlock_t *lock) { preempt_disable(); spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock); @@ -188,7 +188,7 @@ void __lockfunc _write_lock(rwlock_t *lock) { preempt_disable(); rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_); - _raw_write_lock(lock); + LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock); } EXPORT_SYMBOL(_write_lock); @@ -289,7 +289,7 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) { preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - _raw_spin_lock(lock); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } EXPORT_SYMBOL(_spin_lock_nested); @@ -305,8 +305,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas * _raw_spin_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ -#ifdef CONFIG_PROVE_SPIN_LOCKING - _raw_spin_lock(lock); +#ifdef CONFIG_LOCKDEP + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); #else _raw_spin_lock_flags(lock, &flags); #endif diff --git a/kernel/sys.c b/kernel/sys.c index 4d141ae3e80..8ae2e636eb1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include <linux/getcpu.h> #include <linux/task_io_accounting_ops.h> #include <linux/seccomp.h> +#include <linux/cpu.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -100,6 +101,13 @@ struct pid *cad_pid; EXPORT_SYMBOL(cad_pid); /* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); +EXPORT_SYMBOL(pm_power_off_prepare); + +/* * Notifier list for kernel code which wants to be called * at shutdown. This is used to stop any idling DMA operations * and the like. 
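Editorial note: the kernel/spinlock.c hunks above route each slow-path acquisition through LOCK_CONTENDED(lock, trylock, lock) so that lock statistics can distinguish an uncontended fast path from a contended wait. A userspace analogue of that try-first, then-block-and-account shape, built on a pthread mutex (this only illustrates the idea; it is not the kernel macro):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long contended; /* bumped only when the trylock fails */

static void lock_contended_style(pthread_mutex_t *l)
{
        if (pthread_mutex_trylock(l) == 0)
                return;                 /* uncontended fast path */
        pthread_mutex_lock(l);          /* blocking slow path */
        contended++;                    /* counted under the lock, like lock_stat would */
}

static void *worker(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++) {
                lock_contended_style(&lock);
                pthread_mutex_unlock(&lock);
        }
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, worker, NULL);
        pthread_create(&b, NULL, worker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        printf("contended acquisitions: %lu\n", contended);
        return 0;
}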
@@ -797,6 +805,7 @@ static void kernel_restart_prepare(char *cmd) blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); + sysdev_shutdown(); } /** @@ -853,6 +862,7 @@ void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); + sysdev_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); } @@ -867,6 +877,10 @@ EXPORT_SYMBOL_GPL(kernel_halt); void kernel_power_off(void) { kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + disable_nonboot_cpus(); + sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); } @@ -942,7 +956,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user unlock_kernel(); return -EINVAL; -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: { int ret = hibernate(); @@ -1027,7 +1041,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) return -EPERM; } if (new_egid != old_egid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } if (rgid != (gid_t) -1 || @@ -1057,13 +1071,13 @@ asmlinkage long sys_setgid(gid_t gid) if (capable(CAP_SETGID)) { if (old_egid != gid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->gid = current->egid = current->sgid = current->fsgid = gid; } else if ((gid == current->gid) || (gid == current->sgid)) { if (old_egid != gid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->egid = current->fsgid = gid; @@ -1094,7 +1108,7 @@ static int set_user(uid_t new_ruid, int dumpclear) switch_uid(new_user); if (dumpclear) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->uid = new_ruid; @@ -1150,7 +1164,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) return -EAGAIN; if (new_euid != old_euid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = current->euid = new_euid; @@ -1200,7 +1214,7 @@ asmlinkage long sys_setuid(uid_t uid) return -EPERM; if (old_euid != uid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = current->euid = uid; @@ -1245,7 +1259,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) } if (euid != (uid_t) -1) { if (euid != current->euid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->euid = euid; @@ -1295,7 +1309,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) } if (egid != (gid_t) -1) { if (egid != current->egid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->egid = egid; @@ -1341,7 +1355,7 @@ asmlinkage long sys_setfsuid(uid_t uid) uid == current->suid || uid == current->fsuid || capable(CAP_SETUID)) { if (uid != old_fsuid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsuid = uid; @@ -1370,7 +1384,7 @@ asmlinkage long sys_setfsgid(gid_t gid) gid == current->sgid || gid == current->fsgid || capable(CAP_SETGID)) { if (gid != old_fsgid) { - current->mm->dumpable = suid_dumpable; + set_dumpable(current->mm, suid_dumpable); smp_wmb(); } current->fsgid = gid; @@ -1430,7 +1444,6 @@ 
asmlinkage long sys_times(struct tms __user * tbuf) * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. * LBT 04.03.94 */ - asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; @@ -1458,7 +1471,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->real_parent == group_leader) { + if (p->real_parent->tgid == group_leader->tgid) { err = -EPERM; if (task_session(p) != task_session(group_leader)) goto out; @@ -2167,14 +2180,14 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = put_user(current->pdeath_signal, (int __user *)arg2); break; case PR_GET_DUMPABLE: - error = current->mm->dumpable; + error = get_dumpable(current->mm); break; case PR_SET_DUMPABLE: if (arg2 < 0 || arg2 > 1) { error = -EINVAL; break; } - current->mm->dumpable = arg2; + set_dumpable(current->mm, arg2); break; case PR_SET_UNALIGN: @@ -2286,3 +2299,61 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, } return err ? -EFAULT : 0; } + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(char **argv, char **envp) +{ + argv_free(argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setcleanup(info, argv_cleanup); + + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? 
*/ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7063ebc6db0..c7314f95264 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -27,7 +27,6 @@ #include <linux/capability.h> #include <linux/ctype.h> #include <linux/utsname.h> -#include <linux/capability.h> #include <linux/smp_lock.h> #include <linux/fs.h> #include <linux/init.h> @@ -46,6 +45,7 @@ #include <linux/syscalls.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> +#include <linux/reboot.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -77,6 +77,7 @@ extern int percpu_pagelist_fraction; extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; +extern int audit_argv_kb; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -159,6 +160,8 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif +extern int prove_locking; +extern int lock_stat; /* The default sysctl tables: */ @@ -219,8 +222,19 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, @@ -290,6 +304,34 @@ static ctl_table kern_table[] = { }, #endif { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_compat_yield", + .data = &sysctl_sched_compat_yield, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_PROVE_LOCKING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .ctl_name = CTL_UNNUMBERED, + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = KERN_PANIC, .procname = "panic", .data = &panic_timeout, @@ -305,6 +347,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_AUDITSYSCALL + { + .ctl_name = CTL_UNNUMBERED, + .procname = "audit_argv_kb", + .data = &audit_argv_kb, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = KERN_CORE_PATTERN, .procname = "core_pattern", @@ -655,11 +707,11 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_ACPI_SLEEP +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) { .ctl_name = KERN_ACPI_VIDEO_FLAGS, .procname = "acpi_video_flags", - .data = &acpi_video_flags, + .data = &acpi_realmode_flags, .maxlen = sizeof (unsigned long), .mode = 0644, .proc_handler = &proc_doulongvec_minmax, @@ -705,13 +757,26 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif - + { + .ctl_name = CTL_UNNUMBERED, + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, +/* + * NOTE: do 
not add new entries to this table unless you have read + * Documentation/sysctl/ctl_unnumbered.txt + */ { .ctl_name = 0 } }; /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. */ static int zero; +static int two = 2; static int one_hundred = 100; @@ -976,6 +1041,7 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif #ifdef CONFIG_NUMA { .ctl_name = CTL_UNNUMBERED, @@ -987,7 +1053,6 @@ static ctl_table vm_table[] = { .strategy = &sysctl_string, }, #endif -#endif #if defined(CONFIG_X86_32) || \ (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { @@ -1102,7 +1167,10 @@ static ctl_table fs_table[] = { .data = &lease_break_time, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &two, }, { .ctl_name = FS_AIO_NR, @@ -1153,6 +1221,16 @@ static ctl_table fs_table[] = { }; static ctl_table debug_table[] = { +#if defined(CONFIG_X86) || defined(CONFIG_PPC) + { + .ctl_name = CTL_UNNUMBERED, + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .ctl_name = 0 } }; diff --git a/kernel/time.c b/kernel/time.c index ffe19149d77..2289a8d6831 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -57,17 +57,14 @@ EXPORT_SYMBOL(sys_tz); */ asmlinkage long sys_time(time_t __user * tloc) { - /* - * We read xtime.tv_sec atomically - it's updated - * atomically by update_wall_time(), so no need to - * even read-lock the xtime seqlock: - */ - time_t i = xtime.tv_sec; + time_t i; + struct timespec tv; - smp_rmb(); /* sys_time() results are coherent */ + getnstimeofday(&tv); + i = tv.tv_sec; if (tloc) { - if (put_user(i, tloc)) + if (put_user(i,tloc)) i = -EFAULT; } return i; @@ -136,7 +133,6 @@ static inline void warp_clock(void) write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; xtime.tv_sec += sys_tz.tz_minuteswest * 60; - time_interpolator_reset(); write_sequnlock_irq(&xtime_lock); clock_was_set(); } @@ -219,22 +215,6 @@ asmlinkage long sys_adjtimex(struct timex __user *txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -inline struct timespec current_kernel_time(void) -{ - struct timespec now; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - - now = xtime; - } while (read_seqretry(&xtime_lock, seq)); - - return now; -} - -EXPORT_SYMBOL(current_kernel_time); - /** * current_fs_time - Return FS time * @sb: Superblock. 
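The new orderly_poweroff() helper added to kernel/sys.c above, together with the poweroff_cmd string exposed through sysctl, gives in-kernel code a clean way to request a userspace-driven shutdown. A hypothetical caller might look like the sketch below — the driver, function name and trigger are made up; only orderly_poweroff() and its force semantics come from the patch, and the declaration is assumed to live in <linux/reboot.h>, which matches the include added to kernel/sysctl.c:

#include <linux/kernel.h>
#include <linux/reboot.h>	/* orderly_poweroff(), assumed location */

/* Hypothetical over-temperature handler in some sensor driver. */
static void example_overtemp_shutdown(void)
{
	printk(KERN_CRIT "example: temperature critical, shutting down\n");

	/*
	 * force=true: if running the poweroff_cmd helper fails, fall back
	 * to emergency_sync() + kernel_power_off() as shown above.
	 */
	orderly_poweroff(true);
}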
@@ -309,92 +289,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifdef CONFIG_TIME_INTERPOLATION -void getnstimeofday (struct timespec *tv) -{ - unsigned long seq,sec,nsec; - - do { - seq = read_seqbegin(&xtime_lock); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec+time_interpolator_get_offset(); - } while (unlikely(read_seqretry(&xtime_lock, seq))); - - while (unlikely(nsec >= NSEC_PER_SEC)) { - nsec -= NSEC_PER_SEC; - ++sec; - } - tv->tv_sec = sec; - tv->tv_nsec = nsec; -} -EXPORT_SYMBOL_GPL(getnstimeofday); - -int do_settimeofday (struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - { - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - time_interpolator_reset(); - } - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} -EXPORT_SYMBOL(do_settimeofday); - -void do_gettimeofday (struct timeval *tv) -{ - unsigned long seq, nsec, usec, sec, offset; - do { - seq = read_seqbegin(&xtime_lock); - offset = time_interpolator_get_offset(); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec; - } while (unlikely(read_seqretry(&xtime_lock, seq))); - - usec = (nsec + offset) / 1000; - - while (unlikely(usec >= USEC_PER_SEC)) { - usec -= USEC_PER_SEC; - ++sec; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; - - /* - * Make sure xtime.tv_sec [returned by sys_time()] always - * follows the gettimeofday() result precisely. This - * condition is extremely unlikely, it can hit at most - * once per second: - */ - if (unlikely(xtime.tv_sec != tv->tv_sec)) { - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - update_wall_time(); - write_sequnlock_irqrestore(&xtime_lock, flags); - } -} -EXPORT_SYMBOL(do_gettimeofday); - -#else /* CONFIG_TIME_INTERPOLATION */ - #ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval @@ -410,7 +304,6 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif -#endif /* CONFIG_TIME_INTERPOLATION */ /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f6635112654..8d53106a0a9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -23,3 +23,8 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. 
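The CONFIG_TIME_INTERPOLATION implementations of getnstimeofday()/do_gettimeofday() removed above all revolve around the same xtime seqlock read loop, and that idiom is what the generic timekeeping code keeps using. A minimal, self-contained sketch of the reader side (the helper name is invented; xtime and xtime_lock are the real exported globals):

#include <linux/seqlock.h>
#include <linux/time.h>

extern seqlock_t xtime_lock;
extern struct timespec xtime;

/* Illustrative lockless reader: retry whenever a writer raced with us. */
static struct timespec read_wall_time_sketch(void)
{
	struct timespec ts;
	unsigned long seq;

	do {
		seq = read_seqbegin(&xtime_lock);
		ts = xtime;
	} while (read_seqretry(&xtime_lock, seq));

	return ts;
}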
+config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 99b6034fc86..905b0b50792 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 2ad1c37b8df..822beebe664 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -113,16 +113,6 @@ int clockevents_register_notifier(struct notifier_block *nb) return ret; } -/** - * clockevents_unregister_notifier - unregister a clock events change listener - */ -void clockevents_unregister_notifier(struct notifier_block *nb) -{ - spin_lock(&clockevents_lock); - raw_notifier_chain_unregister(&clockevents_chain, nb); - spin_unlock(&clockevents_lock); -} - /* * Notify about a clock event change. Called with clockevents_lock * held. @@ -204,6 +194,7 @@ void clockevents_exchange_device(struct clock_event_device *old, local_irq_restore(flags); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events */ @@ -232,4 +223,4 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); - +#endif diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 438c6b723ee..de6a2d6b3eb 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/time.h> +#include <linux/timer.h> #include <linux/timex.h> #include <linux/jiffies.h> #include <linux/hrtimer.h> @@ -116,11 +117,6 @@ void second_overflow(void) if (xtime.tv_sec % 86400 == 0) { xtime.tv_sec--; wall_to_monotonic.tv_sec++; - /* - * The timer interpolator will make time change - * gradually instead of an immediate jump by one second - */ - time_interpolator_update(-NSEC_PER_SEC); time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second " "23:59:60 UTC\n"); @@ -130,11 +126,6 @@ void second_overflow(void) if ((xtime.tv_sec + 1) % 86400 == 0) { xtime.tv_sec++; wall_to_monotonic.tv_sec--; - /* - * Use of time interpolator for a gradual change of - * time - */ - time_interpolator_update(NSEC_PER_SEC); time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second " "23:59:59 UTC\n"); @@ -185,12 +176,64 @@ u64 current_tick_length(void) return tick_length; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE -void __attribute__ ((weak)) notify_arch_cmos_timer(void) +/* Disable the cmos update - used by virtualization and embedded */ +int no_sync_cmos_clock __read_mostly; + +static void sync_cmos_clock(unsigned long dummy); + +static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timespec now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... 
+ */ + if (!ntp_synced()) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; + + getnstimeofday(&now); + if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) + fail = update_persistent_clock(now); + + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + if (next.tv_nsec <= 0) + next.tv_nsec += NSEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_nsec >= NSEC_PER_SEC) { + next.tv_sec++; + next.tv_nsec -= NSEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); +} + +static void notify_cmos_timer(void) { - return; + if (!no_sync_cmos_clock) + mod_timer(&sync_cmos_timer, jiffies + 1); } +#else +static inline void notify_cmos_timer(void) { } +#endif + /* adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ @@ -355,6 +398,6 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); - notify_arch_cmos_timer(); + notify_cmos_timer(); return(result); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8001d37071f..298bc7c6f09 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -31,6 +31,12 @@ struct tick_device tick_broadcast_device; static cpumask_t tick_broadcast_mask; static DEFINE_SPINLOCK(tick_broadcast_lock); +#ifdef CONFIG_TICK_ONESHOT +static void tick_broadcast_clear_oneshot(int cpu); +#else +static inline void tick_broadcast_clear_oneshot(int cpu) { } +#endif + /* * Debugging: see timer_list.c */ @@ -49,7 +55,7 @@ cpumask_t *tick_get_broadcast_mask(void) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc && bc->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (bc) tick_setup_periodic(bc, 1); } @@ -58,8 +64,9 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if (tick_broadcast_device.evtdev || - (dev->features & CLOCK_EVT_FEAT_C3STOP)) + if ((tick_broadcast_device.evtdev && + tick_broadcast_device.evtdev->rating >= dev->rating) || + (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; clockevents_exchange_device(NULL, dev); @@ -99,8 +106,19 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) cpu_set(cpu, tick_broadcast_mask); tick_broadcast_start_periodic(tick_broadcast_device.evtdev); ret = 1; - } + } else { + /* + * When the new device is not affected by the stop + * feature and the cpu is marked in the broadcast mask + * then clear the broadcast bit. 
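sync_cmos_clock() above is a classic self-rearming kernel timer: notify_cmos_timer() kicks it one jiffy out, and the handler re-arms itself so it keeps landing roughly 500 ms before a second boundary. The pattern, boiled down to a sketch (names are illustrative; the four-argument DEFINE_TIMER() matches the pre-2.6.24 timer API used in the hunk):

#include <linux/timer.h>
#include <linux/jiffies.h>

static void example_poll(unsigned long data);
static DEFINE_TIMER(example_timer, example_poll, 0, 0);

static void example_poll(unsigned long data)
{
	/* ... do the periodic work here ... */

	/* re-arm roughly one second out; stop simply by not re-arming */
	mod_timer(&example_timer, jiffies + HZ);
}

static void example_start(void)
{
	mod_timer(&example_timer, jiffies + 1);
}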
+ */ + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { + int cpu = smp_processor_id(); + cpu_clear(cpu, tick_broadcast_mask); + tick_broadcast_clear_oneshot(cpu); + } + } spin_unlock_irqrestore(&tick_broadcast_lock, flags); return ret; } @@ -159,8 +177,6 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { - dev->next_event.tv64 = KTIME_MAX; - tick_do_periodic_broadcast(); /* @@ -299,7 +315,7 @@ void tick_suspend_broadcast(void) spin_lock_irqsave(&tick_broadcast_lock, flags); bc = tick_broadcast_device.evtdev; - if (bc && tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + if (bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -316,6 +332,8 @@ int tick_resume_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) { + clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: if(!cpus_empty(tick_broadcast_mask)) @@ -364,11 +382,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - - if(!cpus_empty(tick_broadcast_oneshot_mask)) - tick_broadcast_set_event(ktime_get(), 1); - - return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask); + return 0; } /* @@ -485,16 +499,24 @@ out: spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpu_clear(cpu, tick_broadcast_oneshot_mask); +} + /** * tick_broadcast_setup_highres - setup the broadcast device for highres */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; - } + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; } /* @@ -520,20 +542,17 @@ void tick_broadcast_switch_to_oneshot(void) */ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { - struct clock_event_device *bc; unsigned long flags; unsigned int cpu = *cpup; spin_lock_irqsave(&tick_broadcast_lock, flags); - bc = tick_broadcast_device.evtdev; + /* + * Clear the broadcast mask flag for the dead cpu, but do not + * stop the broadcast device! 
+ */ cpu_clear(cpu, tick_broadcast_oneshot_mask); - if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) { - if (bc && cpus_empty(tick_broadcast_oneshot_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - } - spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a96ec9ab345..3f3ae390783 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -200,7 +200,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) cpu = smp_processor_id(); if (!cpu_isset(cpu, newdev->cpumask)) - goto out; + goto out_bc; td = &per_cpu(tick_cpu_device, cpu); curdev = td->evtdev; @@ -265,7 +265,7 @@ out_bc: */ if (tick_check_broadcast_device(newdev)) ret = NOTIFY_STOP; -out: + spin_unlock_irqrestore(&tick_device_lock, flags); return ret; @@ -318,12 +318,17 @@ static void tick_resume(void) { struct tick_device *td = &__get_cpu_var(tick_cpu_device); unsigned long flags; + int broadcast = tick_resume_broadcast(); spin_lock_irqsave(&tick_device_lock, flags); - if (td->mode == TICKDEV_MODE_PERIODIC) - tick_setup_periodic(td->evtdev, 0); - else - tick_resume_oneshot(); + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -360,8 +365,7 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason, break; case CLOCK_EVT_NOTIFY_RESUME: - if (!tick_resume_broadcast()) - tick_resume(); + tick_resume(); break; default: diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index f6997ab0c3c..0258d3115d5 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -73,8 +73,21 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) struct clock_event_device *dev = td->evtdev; if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || - !tick_device_is_functional(dev)) + !tick_device_is_functional(dev)) { + + printk(KERN_INFO "Clockevents: " + "could not switch to one-shot mode:"); + if (!dev) { + printk(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + printk(" %s is not functional.\n", dev->name); + else + printk(" %s does not support one-shot mode.\n", + dev->name); + } return -EINVAL; + } td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 52db9e3c526..8c3fef1db09 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -160,6 +160,18 @@ void tick_nohz_stop_sched_tick(void) cpu = smp_processor_id(); ts = &per_cpu(tick_cpu_sched, cpu); + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. 
+ */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + } + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; @@ -546,6 +558,7 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); + u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -554,8 +567,12 @@ void tick_setup_sched_timer(void) ts->sched_timer.function = tick_sched_timer; ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; - /* Get the next period */ + /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, NR_CPUS); + offset *= smp_processor_id(); + ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 728cedfd3cb..4ad79f6bdec 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -47,10 +47,22 @@ EXPORT_SYMBOL(xtime_lock); struct timespec xtime __attribute__ ((aligned (16))); struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static unsigned long total_sleep_time; /* seconds */ - EXPORT_SYMBOL(xtime); +#ifdef CONFIG_NO_HZ +static struct timespec xtime_cache __attribute__ ((aligned (16))); +static inline void update_xtime_cache(u64 nsec) +{ + xtime_cache = xtime; + timespec_add_ns(&xtime_cache, nsec); +} +#else +#define xtime_cache xtime +/* We do *not* want to evaluate the argument for this case */ +#define update_xtime_cache(n) do { } while (0) +#endif + static struct clocksource *clock; /* pointer to current clocksource */ @@ -205,6 +217,7 @@ static void change_clocksource(void) } #else static inline void change_clocksource(void) { } +static inline s64 __get_nsec_offset(void) { return 0; } #endif /** @@ -268,6 +281,8 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; +/* xtime offset when we went into suspend */ +static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -293,6 +308,8 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } + /* Make sure that we have the correct xtime reference */ + timespec_add_ns(&xtime, timekeeping_suspend_nsecs); /* re-base the last cycle value */ clock->cycle_last = clocksource_read(clock); clock->error = 0; @@ -313,9 +330,12 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + timekeeping_suspend_time = read_persistent_clock(); + write_seqlock_irqsave(&xtime_lock, flags); + /* Get the current xtime offset */ + timekeeping_suspend_nsecs = __get_nsec_offset(); timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); write_sequnlock_irqrestore(&xtime_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); @@ -401,7 +421,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * this is optimized for the most common adjustments of -1,0,1, * for other values we can do a bit more work. 
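The tick_setup_sched_timer() hunk above staggers the per-cpu tick emulation hrtimers instead of letting every cpu expire on the same edge. A small sketch of the arithmetic (the helper name is invented; with HZ=1000 and NR_CPUS=4 it yields offsets of 0, 125, 250 and 375 us for cpus 0-3):

#include <linux/ktime.h>
#include <linux/threads.h>	/* NR_CPUS */
#include <asm/div64.h>		/* do_div() */

/* Illustration of the per-cpu stagger: half a tick period, spread evenly. */
static u64 sched_timer_stagger_ns(int cpu, ktime_t tick_period)
{
	u64 offset = ktime_to_ns(tick_period) >> 1;

	do_div(offset, NR_CPUS);
	return offset * cpu;
}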
*/ -static void clocksource_adjust(struct clocksource *clock, s64 offset) +static void clocksource_adjust(s64 offset) { s64 error, interval = clock->cycle_interval; int adj; @@ -466,22 +486,20 @@ void update_wall_time(void) second_overflow(); } - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - /* accumulate error between NTP and clock interval */ clock->error += current_tick_length(); clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); } /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); + clocksource_adjust(offset); /* store full nanoseconds into xtime */ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + update_xtime_cache(cyc2ns(clock, offset)); + /* check to see if there is a new clocksource to use */ change_clocksource(); update_vsyscall(&xtime, clock); @@ -513,3 +531,25 @@ void monotonic_to_bootbased(struct timespec *ts) { ts->tv_sec += total_sleep_time; } + +unsigned long get_seconds(void) +{ + return xtime_cache.tv_sec; +} +EXPORT_SYMBOL(get_seconds); + + +struct timespec current_kernel_time(void) +{ + struct timespec now; + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + + now = xtime_cache; + } while (read_seqretry(&xtime_lock, seq)); + + return now; +} +EXPORT_SYMBOL(current_kernel_time); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index e5edc3a22a0..fdb2e03d4fe 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -267,7 +267,7 @@ static struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; static int __init init_timer_list_procfs(void) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 8ed62fda16c..c36bb7ed030 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -327,8 +327,9 @@ static int tstats_show(struct seq_file *m, void *v) ms = 1; if (events && period.tv_sec) - seq_printf(m, "%ld total events, %ld.%ld events/sec\n", events, - events / period.tv_sec, events * 1000 / ms); + seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", + events, events * 1000 / ms, + (events * 1000000 / ms) % 1000); else seq_printf(m, "%ld total events\n", events); @@ -399,7 +400,7 @@ static struct file_operations tstats_fops = { .read = seq_read, .write = tstats_write, .llseek = seq_lseek, - .release = seq_release, + .release = single_release, }; void __init init_timer_stats(void) diff --git a/kernel/timer.c b/kernel/timer.c index b7792fb0338..6ce1952eea7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -103,14 +103,14 @@ static inline tvec_base_t *tbase_get_base(tvec_base_t *base) static inline void timer_set_deferrable(struct timer_list *timer) { timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | - TBASE_DEFERRABLE_FLAG)); + TBASE_DEFERRABLE_FLAG)); } static inline void timer_set_base(struct timer_list *timer, tvec_base_t *new_base) { timer->base = (tvec_base_t *)((unsigned long)(new_base) | - tbase_get_deferrable(timer->base)); + tbase_get_deferrable(timer->base)); } /** @@ -445,10 +445,10 @@ EXPORT_SYMBOL(__mod_timer); void add_timer_on(struct timer_list *timer, int cpu) { tvec_base_t *base = per_cpu(tvec_bases, cpu); - unsigned long flags; + unsigned long flags; timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); + BUG_ON(timer_pending(timer) || 
!timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); @@ -627,7 +627,7 @@ static inline void __run_timers(tvec_base_t *base) while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + int index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -644,8 +644,8 @@ static inline void __run_timers(tvec_base_t *base) unsigned long data; timer = list_first_entry(head, struct timer_list,entry); - fn = timer->function; - data = timer->data; + fn = timer->function; + data = timer->data; timer_stats_account_timer(timer); @@ -689,8 +689,8 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) - continue; + if (tbase_get_deferrable(nte->base)) + continue; found = 1; expires = nte->expires; @@ -834,7 +834,7 @@ void update_process_times(int user_tick) if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); - run_posix_cpu_timers(p); + run_posix_cpu_timers(p); } /* @@ -909,7 +909,7 @@ static inline void update_times(unsigned long ticks) update_wall_time(); calc_load(ticks); } - + /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. @@ -1105,7 +1105,7 @@ asmlinkage long sys_gettid(void) /** * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill - */ + */ int do_sysinfo(struct sysinfo *info) { unsigned long mem_total, sav_total; @@ -1349,194 +1349,6 @@ void __init init_timers(void) open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); } -#ifdef CONFIG_TIME_INTERPOLATION - -struct time_interpolator *time_interpolator __read_mostly; -static struct time_interpolator *time_interpolator_list __read_mostly; -static DEFINE_SPINLOCK(time_interpolator_lock); - -static inline cycles_t time_interpolator_get_cycles(unsigned int src) -{ - unsigned long (*x)(void); - - switch (src) - { - case TIME_SOURCE_FUNCTION: - x = time_interpolator->addr; - return x(); - - case TIME_SOURCE_MMIO64 : - return readq_relaxed((void __iomem *)time_interpolator->addr); - - case TIME_SOURCE_MMIO32 : - return readl_relaxed((void __iomem *)time_interpolator->addr); - - default: return get_cycles(); - } -} - -static inline u64 time_interpolator_get_counter(int writelock) -{ - unsigned int src = time_interpolator->source; - - if (time_interpolator->jitter) - { - cycles_t lcycle; - cycles_t now; - - do { - lcycle = time_interpolator->last_cycle; - now = time_interpolator_get_cycles(src); - if (lcycle && time_after(lcycle, now)) - return lcycle; - - /* When holding the xtime write lock, there's no need - * to add the overhead of the cmpxchg. Readers are - * force to retry until the write lock is released. - */ - if (writelock) { - time_interpolator->last_cycle = now; - return now; - } - /* Keep track of the last timer value returned. The use of cmpxchg here - * will cause contention in an SMP environment. 
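The interpolator code being deleted above used cmpxchg() to keep a "last value returned" latch monotonic without taking a lock, which is why its comment worries about cmpxchg contention on SMP. The underlying pattern, reduced to a sketch (the latch and helper are hypothetical; cmpxchg() comes from the arch's system.h on kernels of this vintage):

#include <linux/jiffies.h>	/* time_after() */
#include <asm/system.h>		/* cmpxchg(); location varies by arch */

static unsigned long last_value;	/* hypothetical shared latch */

/* Return a value that never goes backwards across concurrent callers. */
static unsigned long monotonic_latch_sketch(unsigned long now)
{
	unsigned long last;

	do {
		last = last_value;
		if (time_after(last, now))	/* someone already saw later */
			return last;
	} while (cmpxchg(&last_value, last, now) != last);

	return now;
}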
- */ - } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle)); - return now; - } - else - return time_interpolator_get_cycles(src); -} - -void time_interpolator_reset(void) -{ - time_interpolator->offset = 0; - time_interpolator->last_counter = time_interpolator_get_counter(1); -} - -#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) - -unsigned long time_interpolator_get_offset(void) -{ - /* If we do not have a time interpolator set up then just return zero */ - if (!time_interpolator) - return 0; - - return time_interpolator->offset + - GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); -} - -#define INTERPOLATOR_ADJUST 65536 -#define INTERPOLATOR_MAX_SKIP 10*INTERPOLATOR_ADJUST - -void time_interpolator_update(long delta_nsec) -{ - u64 counter; - unsigned long offset; - - /* If there is no time interpolator set up then do nothing */ - if (!time_interpolator) - return; - - /* - * The interpolator compensates for late ticks by accumulating the late - * time in time_interpolator->offset. A tick earlier than expected will - * lead to a reset of the offset and a corresponding jump of the clock - * forward. Again this only works if the interpolator clock is running - * slightly slower than the regular clock and the tuning logic insures - * that. - */ - - counter = time_interpolator_get_counter(1); - offset = time_interpolator->offset + - GET_TI_NSECS(counter, time_interpolator); - - if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) - time_interpolator->offset = offset - delta_nsec; - else { - time_interpolator->skips++; - time_interpolator->ns_skipped += delta_nsec - offset; - time_interpolator->offset = 0; - } - time_interpolator->last_counter = counter; - - /* Tuning logic for time interpolator invoked every minute or so. - * Decrease interpolator clock speed if no skips occurred and an offset is carried. - * Increase interpolator clock speed if we skip too much time. 
- */ - if (jiffies % INTERPOLATOR_ADJUST == 0) - { - if (time_interpolator->skips == 0 && time_interpolator->offset > tick_nsec) - time_interpolator->nsec_per_cyc--; - if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0) - time_interpolator->nsec_per_cyc++; - time_interpolator->skips = 0; - time_interpolator->ns_skipped = 0; - } -} - -static inline int -is_better_time_interpolator(struct time_interpolator *new) -{ - if (!time_interpolator) - return 1; - return new->frequency > 2*time_interpolator->frequency || - (unsigned long)new->drift < (unsigned long)time_interpolator->drift; -} - -void -register_time_interpolator(struct time_interpolator *ti) -{ - unsigned long flags; - - /* Sanity check */ - BUG_ON(ti->frequency == 0 || ti->mask == 0); - - ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency; - spin_lock(&time_interpolator_lock); - write_seqlock_irqsave(&xtime_lock, flags); - if (is_better_time_interpolator(ti)) { - time_interpolator = ti; - time_interpolator_reset(); - } - write_sequnlock_irqrestore(&xtime_lock, flags); - - ti->next = time_interpolator_list; - time_interpolator_list = ti; - spin_unlock(&time_interpolator_lock); -} - -void -unregister_time_interpolator(struct time_interpolator *ti) -{ - struct time_interpolator *curr, **prev; - unsigned long flags; - - spin_lock(&time_interpolator_lock); - prev = &time_interpolator_list; - for (curr = *prev; curr; curr = curr->next) { - if (curr == ti) { - *prev = curr->next; - break; - } - prev = &curr->next; - } - - write_seqlock_irqsave(&xtime_lock, flags); - if (ti == time_interpolator) { - /* we lost the best time-interpolator: */ - time_interpolator = NULL; - /* find the next-best interpolator */ - for (curr = time_interpolator_list; curr; curr = curr->next) - if (is_better_time_interpolator(curr)) - time_interpolator = curr; - time_interpolator_reset(); - } - write_sequnlock_irqrestore(&xtime_lock, flags); - spin_unlock(&time_interpolator_lock); -} -#endif /* CONFIG_TIME_INTERPOLATION */ - /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Time in milliseconds to sleep for diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 658f638c402..c122131a122 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -39,7 +39,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) ac_etime = timespec_to_ns(&ts); do_div(ac_etime, NSEC_PER_USEC); stats->ac_etime = ac_etime; - stats->ac_btime = xtime.tv_sec - ts.tv_sec; + stats->ac_btime = get_seconds() - ts.tv_sec; if (thread_group_leader(tsk)) { stats->ac_exitcode = tsk->exit_code; if (tsk->flags & PF_FORKNOEXEC) diff --git a/kernel/user.c b/kernel/user.c index 98b82507797..9ca2848fc35 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,25 +55,22 @@ struct user_struct root_user = { /* * These routines must be called with the uidhash spinlock held! 
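kernel/user.c above moves the uid hash from struct list_head buckets to the smaller hlist variant; note that hlist_for_each_entry() in this era takes a separate struct hlist_node cursor as its second argument, exactly as the converted uid_hash_find() does. A generic sketch of the bucket-lookup pattern the conversion lands on (struct item and its fields are made up):

#include <linux/list.h>

struct item {
	unsigned int		key;
	struct hlist_node	node;
};

/* Walk one hash bucket looking for a key; caller holds the hash lock. */
static struct item *item_find(unsigned int key, struct hlist_head *bucket)
{
	struct item *it;
	struct hlist_node *pos;

	hlist_for_each_entry(it, pos, bucket, node) {
		if (it->key == key)
			return it;
	}
	return NULL;
}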
*/ -static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { - list_add(&up->uidhash_list, hashent); + hlist_add_head(&up->uidhash_node, hashent); } static inline void uid_hash_remove(struct user_struct *up) { - list_del(&up->uidhash_list); + hlist_del_init(&up->uidhash_node); } -static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { - struct list_head *up; - - list_for_each(up, hashent) { - struct user_struct *user; - - user = list_entry(up, struct user_struct, uidhash_list); + struct user_struct *user; + struct hlist_node *h; + hlist_for_each_entry(user, h, hashent, uidhash_node) { if(user->uid == uid) { atomic_inc(&user->__count); return user; @@ -122,7 +119,7 @@ void free_uid(struct user_struct *up) struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(ns, uid); + struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -202,16 +199,40 @@ void switch_uid(struct user_struct *new_user) suid_keys(current); } +void release_uids(struct user_namespace *ns) +{ + int i; + unsigned long flags; + struct hlist_head *head; + struct hlist_node *nd; + + spin_lock_irqsave(&uidhash_lock, flags); + /* + * collapse the chains so that the user_struct-s will + * be still alive, but not in hashes. subsequent free_uid() + * will free them. + */ + for (i = 0; i < UIDHASH_SZ; i++) { + head = ns->uidhash_table + i; + while (!hlist_empty(head)) { + nd = head->first; + hlist_del_init(nd); + } + } + spin_unlock_irqrestore(&uidhash_lock, flags); + + free_uid(ns->root_user); +} static int __init uid_cache_init(void) { int n; uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(init_user_ns.uidhash_table + n); + INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d055d987850..7af90fc4f0f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(ns->uidhash_table + n); + INIT_HLIST_HEAD(ns->uidhash_table + n); /* Insert new root user. 
*/ ns->root_user = alloc_uid(ns, 0); @@ -81,6 +81,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); + release_uids(ns); kfree(ns); } diff --git a/kernel/utsname.c b/kernel/utsname.c index 9d8180a0f0d..816d7b24fa0 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -28,7 +28,9 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) if (!ns) return ERR_PTR(-ENOMEM); + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + up_read(&uts_sem); kref_init(&ns->kref); return ns; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 58e5c152a6b..e080d1d744c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -635,7 +635,7 @@ int keventd_up(void) int current_is_keventd(void) { struct cpu_workqueue_struct *cwq; - int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */ + int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ int ret = 0; BUG_ON(!keventd_wq);
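Finally, the kernel/workqueue.c hunk swaps smp_processor_id() for raw_smp_processor_id() in current_is_keventd(): with CONFIG_DEBUG_PREEMPT the checked variant complains when called from preemptible context, while the existing "preempt-safe: keventd is per-cpu" comment argues the result is harmless here even if the task migrates. Where the answer does have to be stable, the usual idiom is to pin the task first, as in this illustrative sketch:

#include <linux/smp.h>
#include <linux/preempt.h>

/* Illustrative only: take a cpu-stable snapshot under preempt_disable(). */
static int running_on_boot_cpu(void)
{
	int cpu;

	preempt_disable();		/* can't migrate while we look */
	cpu = smp_processor_id();	/* checked variant is fine here */
	preempt_enable();

	return cpu == 0;
}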