Diffstat (limited to 'kernel')
57 files changed, 2235 insertions, 1391 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ac6b27abb1a..642d4277c2e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/audit.c b/kernel/audit.c index 4e9d2082968..d13276d4141 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) err = -EPERM; break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; @@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_USER: - case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: + case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: + case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f382b0f775e..88b416dfbc7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -42,7 +42,6 @@ #include <linux/seq_file.h> #include <linux/security.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/stat.h> #include <linux/string.h> @@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = cpulist_parse(buf, trialcs.cpus_allowed); - if (retval < 0) - return retval; + + /* + * We allow a cpuset's cpus_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + cpus_clear(trialcs.cpus_allowed); + } else { + retval = cpulist_parse(buf, trialcs.cpus_allowed); + if (retval < 0) + return retval; + } cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); - if (cpus_empty(trialcs.cpus_allowed)) + /* cpus_allowed cannot be empty for a cpuset with attached tasks. */ + if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed)) return -ENOSPC; retval = validate_change(cs, &trialcs); if (retval < 0) @@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf) return -EACCES; trialcs = *cs; - retval = nodelist_parse(buf, trialcs.mems_allowed); - if (retval < 0) - goto done; + + /* + * We allow a cpuset's mems_allowed to be empty; if it has attached + * tasks, we'll catch it later when we validate the change and return + * -ENOSPC. + */ + if (!buf[0] || (buf[0] == '\n' && !buf[1])) { + nodes_clear(trialcs.mems_allowed); + } else { + retval = nodelist_parse(buf, trialcs.mems_allowed); + if (retval < 0) + goto done; + } nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); oldmem = cs->mems_allowed; if (nodes_equal(oldmem, trialcs.mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } - if (nodes_empty(trialcs.mems_allowed)) { + /* mems_allowed cannot be empty for a cpuset with attached tasks. 
*/ + if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) { retval = -ENOSPC; goto done; } @@ -2200,10 +2221,6 @@ void cpuset_fork(struct task_struct *child) * it is holding that mutex while calling check_for_release(), * which calls kmalloc(), so can't be called holding callback_mutex(). * - * We don't need to task_lock() this reference to tsk->cpuset, - * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it, or task is a failed fork, never visible to attach_task. - * * the_top_cpuset_hack: * * Set the exiting tasks cpuset to the root cpuset (top_cpuset). @@ -2242,8 +2259,10 @@ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; + task_lock(current); cs = tsk->cpuset; tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */ + task_unlock(current); if (notify_on_release(cs)) { char *pathbuf = NULL; @@ -2351,6 +2370,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * If the task has been OOM killed and has access to memory reserves + * as specified by the TIF_MEMDIE flag, yes. * Otherwise, no. * * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() @@ -2368,7 +2389,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * calls get to this routine, we should just shut up and say 'yes'. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, - * and do not allow allocations outside the current tasks cpuset. + * and do not allow allocations outside the current tasks cpuset + * unless the task has been OOM killed as is marked TIF_MEMDIE. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing mem_exclusive ancestor cpuset. * @@ -2392,6 +2414,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok + * TIF_MEMDIE - any node ok * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. * @@ -2413,6 +2436,12 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return 1; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return 0; @@ -2438,7 +2467,9 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) * * If we're in interrupt, yes, we can always allocate. * If __GFP_THISNODE is set, yes, we can always allocate. If zone - * z's node is in our tasks mems_allowed, yes. Otherwise, no. + * z's node is in our tasks mems_allowed, yes. If the task has been + * OOM killed and has access to memory reserves as specified by the + * TIF_MEMDIE flag, yes. Otherwise, no. 
 * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by * @@ -2462,6 +2493,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) node = zone_to_nid(z); if (node_isset(node, current->mems_allowed)) return 1; + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return 1; return 0; } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 766d5912b26..c0148ae992c 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -31,11 +31,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = kmem_cache_create("delayacct_cache", - sizeof(struct task_delay_info), - 0, - SLAB_PANIC, - NULL, NULL); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); delayacct_tsk_init(&init_task); } diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c new file mode 100644 index 00000000000..0d98827887a --- /dev/null +++ b/kernel/die_notifier.c @@ -0,0 +1,38 @@ + +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> +#include <linux/kdebug.h> + + +static ATOMIC_NOTIFIER_HEAD(die_chain); + +int notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + return atomic_notifier_call_chain(&die_chain, val, &args); +} + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); + + diff --git a/kernel/exit.c b/kernel/exit.c index b55ed4cc910..f5a7abb621f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -7,7 +7,6 @@ #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> @@ -1033,6 +1032,8 @@ asmlinkage void sys_exit_group(int error_code) static int eligible_child(pid_t pid, int options, struct task_struct *p) { + int err; + if (pid > 0) { if (p->pid != pid) return 0; @@ -1066,8 +1067,9 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p) if (delay_group_leader(p)) return 2; - if (security_task_wait(p)) - return 0; + err = security_task_wait(p); + if (err) + return err; return 1; } @@ -1449,6 +1451,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop, DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int flag, retval; + int allowed, denied; add_wait_queue(&current->signal->wait_chldexit,&wait); repeat: @@ -1457,6 +1460,7 @@ repeat: * match our criteria, even if we are not able to reap it yet. 
 */ flag = 0; + allowed = denied = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); tsk = current; @@ -1472,6 +1476,12 @@ repeat: if (!ret) continue; + if (unlikely(ret < 0)) { + denied = ret; + continue; + } + allowed = 1; + switch (p->state) { case TASK_TRACED: /* @@ -1570,6 +1580,8 @@ check_continued: goto repeat; } retval = -ECHILD; + if (unlikely(denied) && !allowed) + retval = denied; end: current->state = TASK_RUNNING; remove_wait_queue(&current->signal->wait_chldexit,&wait); diff --git a/kernel/fork.c b/kernel/fork.c index 6af959c034d..a8dd75d4992 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -14,7 +14,6 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/unistd.h> -#include <linux/smp_lock.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> @@ -286,6 +285,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); retval = 0; out: up_write(&mm->mmap_sem); @@ -1423,8 +1424,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long fl { struct sighand_struct *sighand = data; - if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) + if (flags & SLAB_CTOR_CONSTRUCTOR) spin_lock_init(&sighand->siglock); } @@ -1515,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) } /* - * Unshare the mnt_namespace structure if it is being shared - */ -static int unshare_mnt_namespace(unsigned long unshare_flags, - struct mnt_namespace **new_nsp, struct fs_struct *new_fs) -{ - struct mnt_namespace *ns = current->nsproxy->mnt_ns; - - if ((unshare_flags & CLONE_NEWNS) && ns) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); - if (!*new_nsp) - return -ENOMEM; - } - - return 0; -} - -/* * Unsharing of sighand is not supported yet */ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) @@ -1592,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } -#ifndef CONFIG_IPC_NS -static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) -{ - if (flags & CLONE_NEWIPC) - return -EINVAL; - - return 0; -} -#endif - /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. 
copy_* @@ -1614,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) { int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct mnt_namespace *ns, *new_ns = NULL; struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; - struct uts_namespace *uts, *new_uts = NULL; - struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); @@ -1636,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) goto bad_unshare_cleanup_thread; - if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) - goto bad_unshare_cleanup_fs; if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_ns; + goto bad_unshare_cleanup_fs; if ((err = unshare_vm(unshare_flags, &new_mm))) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; - if ((err = unshare_utsname(unshare_flags, &new_uts))) + if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, + new_fs))) goto bad_unshare_cleanup_semundo; - if ((err = unshare_ipcs(unshare_flags, &new_ipc))) - goto bad_unshare_cleanup_uts; - - if (new_ns || new_uts || new_ipc) { - old_nsproxy = current->nsproxy; - new_nsproxy = dup_namespaces(old_nsproxy); - if (!new_nsproxy) { - err = -ENOMEM; - goto bad_unshare_cleanup_ipc; - } - } - if (new_fs || new_ns || new_mm || new_fd || new_ulist || - new_uts || new_ipc) { + if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { task_lock(current); if (new_nsproxy) { + old_nsproxy = current->nsproxy; current->nsproxy = new_nsproxy; new_nsproxy = old_nsproxy; } @@ -1676,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fs = fs; } - if (new_ns) { - ns = current->nsproxy->mnt_ns; - current->nsproxy->mnt_ns = new_ns; - new_ns = ns; - } - if (new_mm) { mm = current->mm; active_mm = current->active_mm; @@ -1697,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } - if (new_uts) { - uts = current->nsproxy->uts_ns; - current->nsproxy->uts_ns = new_uts; - new_uts = uts; - } - - if (new_ipc) { - ipc = current->nsproxy->ipc_ns; - current->nsproxy->ipc_ns = new_ipc; - new_ipc = ipc; - } - task_unlock(current); } if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_ipc: - if (new_ipc) - put_ipc_ns(new_ipc); - -bad_unshare_cleanup_uts: - if (new_uts) - put_uts_ns(new_uts); - bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) @@ -1737,10 +1666,6 @@ bad_unshare_cleanup_sigh: if (atomic_dec_and_test(&new_sigh->count)) kmem_cache_free(sighand_cachep, new_sigh); -bad_unshare_cleanup_ns: - if (new_ns) - put_mnt_ns(new_ns); - bad_unshare_cleanup_fs: if (new_fs) put_fs_struct(new_fs); diff --git a/kernel/futex.c b/kernel/futex.c index 5a270b5e3f9..600bc9d801f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -48,6 +48,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <linux/module.h> #include <asm/futex.h> #include "rtmutex_common.h" @@ -55,32 +56,6 @@ #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* - * Futexes are matched on equal values of this key. - * The key type depends on whether it's a shared or private mapping. 
- Don't rearrange members without looking at hash_futex(). - * - * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. - * We set bit 0 to indicate if it's an inode-based key. - */ -union futex_key { - struct { - unsigned long pgoff; - struct inode *inode; - int offset; - } shared; - struct { - unsigned long address; - struct mm_struct *mm; - int offset; - } private; - struct { - unsigned long word; - void *ptr; - int offset; - } both; -}; - -/* * Priority Inheritance state: */ struct futex_pi_state { @@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * * Should be called with &current->mm->mmap_sem but NOT any spinlocks. */ -static int get_futex_key(u32 __user *uaddr, union futex_key *key) +int get_futex_key(u32 __user *uaddr, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) } return err; } +EXPORT_SYMBOL_GPL(get_futex_key); /* * Take a reference to the resource addressed by a key. @@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) * NOTE: mmap_sem MUST be held between get_futex_key() and calling this * function, if it is called at all. mmap_sem keeps key->shared.inode valid. */ -static inline void get_key_refs(union futex_key *key) +inline void get_futex_key_refs(union futex_key *key) { if (key->both.ptr != 0) { if (key->both.offset & 1) @@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key) atomic_inc(&key->private.mm->mm_count); } } +EXPORT_SYMBOL_GPL(get_futex_key_refs); /* * Drop a reference to the resource addressed by a key. * The hash bucket spinlock must not be held. */ -static void drop_key_refs(union futex_key *key) +void drop_futex_key_refs(union futex_key *key) { if (key->both.ptr != 0) { if (key->both.offset & 1) @@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key) mmdrop(key->private.mm); } } +EXPORT_SYMBOL_GPL(drop_futex_key_refs); static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { @@ -873,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, this->lock_ptr = &hb2->lock; } this->key = key2; - get_key_refs(&key2); + get_futex_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) @@ -886,9 +864,9 @@ out_unlock: if (hb1 != hb2) spin_unlock(&hb2->lock); - /* drop_key_refs() must be called outside the spinlocks. 
 */ while (--drop_count >= 0) - drop_key_refs(&key1); + drop_futex_key_refs(&key1); out: up_read(&current->mm->mmap_sem); @@ -906,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) init_waitqueue_head(&q->waiters); - get_key_refs(&q->key); + get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -925,7 +903,7 @@ static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { spin_unlock(&hb->lock); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } /* @@ -980,7 +958,7 @@ static int unqueue_me(struct futex_q *q) ret = 1; } - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); return ret; } @@ -999,15 +977,18 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) spin_unlock(&hb->lock); - drop_key_refs(&q->key); + drop_futex_key_refs(&q->key); } -static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) +static long futex_wait_restart(struct restart_block *restart); +static int futex_wait_abstime(u32 __user *uaddr, u32 val, + int timed, unsigned long abs_time) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; + unsigned long time_left = 0; u32 uval; int ret; @@ -1087,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) * !list_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. */ - if (likely(!list_empty(&q.list))) - time = schedule_timeout(time); + time_left = 0; + if (likely(!list_empty(&q.list))) { + unsigned long rel_time; + + if (timed) { + unsigned long now = jiffies; + if (time_after(now, abs_time)) + rel_time = 0; + else + rel_time = abs_time - now; + } else + rel_time = MAX_SCHEDULE_TIMEOUT; + + time_left = schedule_timeout(rel_time); + } __set_current_state(TASK_RUNNING); /* @@ -1099,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) /* If we were woken (and unqueued), we succeeded, whatever. */ if (!unqueue_me(&q)) return 0; - if (time == 0) + if (time_left == 0) return -ETIMEDOUT; + /* * We expect signal_pending(current), but another thread may * have handled it for us already. */ - return -EINTR; + if (time_left == MAX_SCHEDULE_TIMEOUT) + return -ERESTARTSYS; + else { + struct restart_block *restart; + restart = &current_thread_info()->restart_block; + restart->fn = futex_wait_restart; + restart->arg0 = (unsigned long)uaddr; + restart->arg1 = (unsigned long)val; + restart->arg2 = (unsigned long)timed; + restart->arg3 = abs_time; + return -ERESTART_RESTARTBLOCK; + } out_unlock_release_sem: queue_unlock(&q, hb); @@ -1115,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) return ret; } +static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time) +{ + int timed = (rel_time != MAX_SCHEDULE_TIMEOUT); + return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time); +} + +static long futex_wait_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->arg0; + u32 val = (u32)restart->arg1; + int timed = (int)restart->arg2; + unsigned long abs_time = restart->arg3; + + restart->fn = do_no_restart_syscall; + return (long)futex_wait_abstime(uaddr, val, timed, abs_time); +} + + /* * Userspace tried a 0 -> TID atomic transition of the futex value * and failed. 
The kernel side here does the whole locking operation: diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 1b3033105b4..c9f4f044a8a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -669,6 +669,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) return orun; } +EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0133f4f9e9f..615ce97c6cf 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -11,6 +11,7 @@ */ #include <linux/irq.h> +#include <linux/msi.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> @@ -185,6 +186,8 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; + if (entry) + entry->irq = irq; spin_unlock_irqrestore(&desc->lock, flags); return 0; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index aff1f0fabb0..32e1ab1477d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * * Controller mappings for all interrupt sources: */ -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .chip = &no_irq_chip, @@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq) if (desc->chip->ack) desc->chip->ack(irq); action_ret = handle_IRQ_event(irq, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); desc->chip->end(irq); return 1; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5597c157442..203a518b6f1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) } *p = new; -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif + /* Exclude IRQ from balancing */ if (new->flags & IRQF_NOBALANCING) desc->status |= IRQ_NO_BALANCING; @@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); +#if defined(CONFIG_IRQ_PER_CPU) + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; +#endif + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { if (desc->chip && desc->chip->set_type) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 2db91eb54ad..ddde0ef9ccd 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_desc + irq; struct irqaction *action; + unsigned long flags; + int ret = 1; - for (action = desc->action ; action; action = action->next) + spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action ; action; action = action->next) { if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) - return 0; - return 1; + !strcmp(new_action->name, action->name)) { + ret = 0; + break; + } + } + spin_unlock_irqrestore(&desc->lock, flags); + return ret; } void register_handler_proc(unsigned int irq, struct irqaction *action) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 9d8c79b4882..b0d81aae472 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, if (unlikely(irqfixup)) { /* 
Don't punish working computers */ - if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { + if ((irqfixup == 2 && ((irq == 0) || + (desc->action->flags & IRQF_IRQPOLL))) || + action_ret == IRQ_NONE) { int ok = misrouted_irq(irq); if (action_ret == IRQ_NONE) desc->irqs_unhandled -= ok; diff --git a/kernel/itimer.c b/kernel/itimer.c index 307c6a632ef..3205e8e114f 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -7,7 +7,6 @@ /* These are all the functions necessary to implement itimers */ #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/syscalls.h> #include <linux/time.h> @@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) } /* - * We do not care about correctness. We just sanitize the values so - * the ktime_t operations which expect normalized values do not - * break. This converts negative values to long timeouts similar to - * the code in kernel versions < 2.6.16 - * - * Print a limited number of warning messages when an invalid timeval - * is detected. - */ -static void fixup_timeval(struct timeval *tv, int interval) -{ - static int warnlimit = 10; - unsigned long tmp; - - if (warnlimit > 0) { - warnlimit--; - printk(KERN_WARNING - "setitimer: %s (pid = %d) provided " - "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n", - current->comm, current->pid, - interval ? "it_interval" : "it_value", - tv->tv_sec, (long) tv->tv_usec); - } - - tmp = tv->tv_usec; - if (tmp >= USEC_PER_SEC) { - tv->tv_usec = tmp % USEC_PER_SEC; - tv->tv_sec += tmp / USEC_PER_SEC; - } - - tmp = tv->tv_sec; - if (tmp > LONG_MAX) - tv->tv_sec = LONG_MAX; -} - -/* * Returns true if the timeval is in canonical form */ #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -/* - * Check for invalid timevals, sanitize them and print a limited - * number of warnings. - */ -static void check_itimerval(struct itimerval *value) { - - if (unlikely(!timeval_valid(&value->it_value))) - fixup_timeval(&value->it_value, 0); - - if (unlikely(!timeval_valid(&value->it_interval))) - fixup_timeval(&value->it_interval, 1); -} - int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; @@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) /* * Validate the timevals in value. - * - * Note: Although the spec requires that invalid values shall - * return -EINVAL, we just fixup the value and print a limited - * number of warnings in order not to break users of this - * historical misfeature. 
- * - * Scheduled for replacement in March 2007 */ - check_itimerval(value); + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; switch (which) { case ITIMER_REAL: diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5a0de840973..f1bda23140b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr, symbol_end = (unsigned long)_etext; } - *symbolsize = symbol_end - symbol_start; - *offset = addr - symbol_start; + if (symbolsize) + *symbolsize = symbol_end - symbol_start; + if (offset) + *offset = addr - symbol_start; return low; } @@ -267,6 +269,42 @@ const char *kallsyms_lookup(unsigned long addr, return NULL; } +int lookup_symbol_name(unsigned long addr, char *symname) +{ + symname[0] = '\0'; + symname[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, NULL, NULL); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), symname); + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_name(addr, symname); +} + +int lookup_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + name[0] = '\0'; + name[KSYM_NAME_LEN] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, size, offset); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), name); + modname[0] = '\0'; + return 0; + } + /* see if it's in a module */ + return lookup_module_symbol_attrs(addr, size, offset, modname, name); +} + /* Look up a kernel symbol and return it in a text buffer. */ int sprint_symbol(char *buffer, unsigned long address) { @@ -301,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address) struct kallsym_iter { loff_t pos; - struct module *owner; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols */ char type; char name[KSYM_NAME_LEN+1]; + char module_name[MODULE_NAME_LEN + 1]; + int exported; }; static int get_ksymbol_mod(struct kallsym_iter *iter) { - iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, &iter->type, - iter->name, sizeof(iter->name)); - if (iter->owner == NULL) + if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, + &iter->type, iter->name, iter->module_name, + &iter->exported) < 0) return 0; - - /* Label it "global" if it is exported, "local" if not exported. */ - iter->type = is_exported(iter->name, iter->owner) - ? toupper(iter->type) : tolower(iter->type); - return 1; } @@ -328,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; - iter->owner = NULL; + iter->module_name[0] = '\0'; iter->value = kallsyms_addresses[iter->pos]; iter->type = kallsyms_get_symbol_type(off); @@ -392,12 +425,17 @@ static int s_show(struct seq_file *m, void *p) if (!iter->name[0]) return 0; - if (iter->owner) + if (iter->module_name[0]) { + char type; + + /* Label it "global" if it is exported, + * "local" if not exported. */ + type = iter->exported ? 
toupper(iter->type) : + tolower(iter->type); seq_printf(m, "%0*lx %c %s\t[%s]\n", (int)(2*sizeof(void*)), - iter->value, iter->type, iter->name, - module_name(iter->owner)); - else + iter->value, type, iter->name, iter->module_name); + } else seq_printf(m, "%0*lx %c %s\n", (int)(2*sizeof(void*)), iter->value, iter->type, iter->name); @@ -432,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return ret; } -static int kallsyms_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = (struct seq_file *)file->private_data; - kfree(m->private); - return seq_release(inode, file); -} - static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, .llseek = seq_lseek, - .release = kallsyms_release, + .release = seq_release_private, }; static int __init kallsyms_init(void) diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a59c8a01ae..25db14b89e8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); final_note(buf); } diff --git a/kernel/kmod.c b/kernel/kmod.c index 796276141e5..49cc4b9c1a8 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -23,7 +23,6 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/mnt_namespace.h> #include <linux/completion.h> @@ -166,6 +165,12 @@ static int ____call_usermodehelper(void *data) /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed(current, CPU_MASK_ALL); + /* + * Our parent is keventd, which runs with elevated scheduling priority. + * Avoid propagating that into the userspace child. 
+ */ + set_user_nice(current, 0); + retval = -EPERM; if (current->fs->root) retval = kernel_execve(sub_info->path, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d25a9ada3f8..9e47d8c493f 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -35,16 +35,19 @@ #include <linux/hash.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/stddef.h> #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/kallsyms.h> #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/kdebug.h> + #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> -#include <asm/kdebug.h> +#include <asm/uaccess.h> #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) @@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobe_enabled; + DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) struct kprobe_insn_page *kip; struct hlist_node *pos; - retry: - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + retry: + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->nused < INSNS_PER_PAGE) { int i; for (i = 0; i < INSNS_PER_PAGE; i++) { @@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) } /* All out of space. Need to allocate a new page. Use slot 0. */ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); - if (!kip) { + if (!kip) return NULL; - } /* * Use module_alloc so this page is within +/- 2GB of where the @@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void) if (check_safety() != 0) return -EAGAIN; - hlist_for_each_safe(pos, next, &kprobe_insn_pages) { + hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { int i; - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ @@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) struct kprobe_insn_page *kip; struct hlist_node *pos; - hlist_for_each(pos, &kprobe_insn_pages) { - kip = hlist_entry(pos, struct kprobe_insn_page, hlist); + hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { if (kip->insns <= slot && slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { int i = (slot - kip->insns) / MAX_INSN_SIZE; @@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) break; } } - if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { + + if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) collect_garbage_slots(); - } } #endif @@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, reset_kprobe_instance(); } } - return; } static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, @@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) } /* Called with kretprobe_lock held */ -struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->free_instances, uflist) - return ri; - return 
NULL; -} - -/* Called with kretprobe_lock held */ -static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe - *rp) -{ - struct hlist_node *node; - struct kretprobe_instance *ri; - hlist_for_each_entry(ri, node, &rp->used_instances, uflist) - return ri; - return NULL; -} - -/* Called with kretprobe_lock held */ -void __kprobes add_rp_inst(struct kretprobe_instance *ri) -{ - /* - * Remove rp inst off the free list - - * Add it back when probed function returns - */ - hlist_del(&ri->uflist); - - /* Add rp inst onto table */ - INIT_HLIST_NODE(&ri->hlist); - hlist_add_head(&ri->hlist, - &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); - - /* Also add this rp inst to the used list. */ - INIT_HLIST_NODE(&ri->uflist); - hlist_add_head(&ri->uflist, &ri->rp->used_instances); -} - -/* Called with kretprobe_lock held */ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head) { @@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - while ((ri = get_free_rp_inst(rp)) != NULL) { + struct hlist_node *pos, *next; + + hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { hlist_del(&ri->uflist); kfree(ri); } @@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { - if (addr >= (unsigned long)__kprobes_text_start - && addr < (unsigned long)__kprobes_text_end) + if (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) return -EINVAL; return 0; } @@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); - if ((!kernel_text_address((unsigned long) p->addr)) || - in_kprobes_functions((unsigned long) p->addr)) + if (!kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; p->mod_refcounted = 0; - /* Check are we probing a module */ - if ((probed_mod = module_text_address((unsigned long) p->addr))) { + + /* + * Check if are we probing a module. + */ + probed_mod = module_text_address((unsigned long) p->addr); + if (probed_mod) { struct module *calling_mod = module_text_address(called_from); - /* We must allow modules to probe themself and - * in this case avoid incrementing the module refcount, - * so as to allow unloading of self probing modules. + /* + * We must allow modules to probe themself and in this case + * avoid incrementing the module refcount, so as to allow + * unloading of self probing modules. 
*/ - if (calling_mod && (calling_mod != probed_mod)) { + if (calling_mod && calling_mod != probed_mod) { if (unlikely(!try_module_get(probed_mod))) return -EINVAL; p->mod_refcounted = 1; @@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, goto out; } - if ((ret = arch_prepare_kprobe(p)) != 0) + ret = arch_prepare_kprobe(p); + if (ret) goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (atomic_add_return(1, &kprobe_count) == \ + if (kprobe_enabled) { + if (atomic_add_return(1, &kprobe_count) == \ (ARCH_INACTIVE_KPROBE_COUNT + 1)) - register_page_fault_notifier(&kprobe_page_fault_nb); - - arch_arm_kprobe(p); + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); + } out: mutex_unlock(&kprobe_mutex); @@ -616,8 +586,7 @@ out: int __kprobes register_kprobe(struct kprobe *p) { - return __register_kprobe(p, - (unsigned long)__builtin_return_address(0)); + return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_kprobe(struct kprobe *p) @@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p) return; } valid_p: - if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) && - (p->list.next == &old_p->list) && - (p->list.prev == &old_p->list))) { - /* Only probe on the hash list */ - arch_disarm_kprobe(p); + if (old_p == p || + (old_p->pre_handler == aggr_pre_handler && + p->list.next == &old_p->list && p->list.prev == &old_p->list)) { + /* + * Only probe on the hash list. Disarm only if kprobes are + * enabled - otherwise, the breakpoint would already have + * been removed. We save on flushing icache. + */ + if (kprobe_enabled) + arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); cleanup_p = 1; } else { @@ -656,9 +630,11 @@ valid_p: mutex_unlock(&kprobe_mutex); synchronize_sched(); - if (p->mod_refcounted && - (mod = module_text_address((unsigned long)p->addr))) - module_put(mod); + if (p->mod_refcounted) { + mod = module_text_address((unsigned long)p->addr); + if (mod) + module_put(mod); + } if (cleanup_p) { if (p != old_p) { @@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ spin_lock_irqsave(&kretprobe_lock, flags); - arch_prepare_kretprobe(rp, regs); + if (!hlist_empty(&rp->free_instances)) { + struct kretprobe_instance *ri; + + ri = hlist_entry(rp->free_instances.first, + struct kretprobe_instance, uflist); + ri->rp = rp; + ri->task = current; + arch_prepare_kretprobe(ri, regs); + + /* XXX(hch): why is there no hlist_move_head? 
*/ + hlist_del(&ri->uflist); + hlist_add_head(&ri->uflist, &ri->rp->used_instances); + hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); + } else + rp->nmissed++; spin_unlock_irqrestore(&kretprobe_lock, flags); return 0; } @@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; + struct hlist_node *pos, *next; unregister_kprobe(&rp->kp); + /* No race here */ spin_lock_irqsave(&kretprobe_lock, flags); - while ((ri = get_used_rp_inst(rp)) != NULL) { + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { ri->rp = NULL; hlist_del(&ri->uflist); } @@ -816,6 +808,9 @@ static int __init init_kprobes(void) } atomic_set(&kprobe_count, 0); + /* By default, kprobes are enabled */ + kprobe_enabled = true; + err = arch_init_kprobes(); if (!err) err = register_die_notifier(&kprobe_exceptions_nb); @@ -825,7 +820,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset,char *modname) { char *kprobe_type; @@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) struct kprobe *p, *kp; const char *sym = NULL; unsigned int i = *(loff_t *) v; - unsigned long size, offset = 0; + unsigned long offset = 0; char *modname, namebuf[128]; head = &kprobe_table[i]; preempt_disable(); hlist_for_each_entry_rcu(p, node, head, hlist) { - sym = kallsyms_lookup((unsigned long)p->addr, &size, + sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); if (p->pre_handler == aggr_pre_handler) { list_for_each_entry_rcu(kp, &p->list, list) @@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; +static void __kprobes enable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already enabled, just return */ + if (kprobe_enabled) + goto already_enabled; + + /* + * Re-register the page fault notifier only if there are any + * active probes at the time of enabling kprobes globally + */ + if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT) + register_page_fault_notifier(&kprobe_page_fault_nb); + + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + arch_arm_kprobe(p); + } + + kprobe_enabled = true; + printk(KERN_INFO "Kprobes globally enabled\n"); + +already_enabled: + mutex_unlock(&kprobe_mutex); + return; +} + +static void __kprobes disable_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + mutex_lock(&kprobe_mutex); + + /* If kprobes are already disabled, just return */ + if (!kprobe_enabled) + goto already_disabled; + + kprobe_enabled = false; + printk(KERN_INFO "Kprobes globally disabled\n"); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!arch_trampoline_kprobe(p)) + arch_disarm_kprobe(p); + } + } + + mutex_unlock(&kprobe_mutex); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); + + mutex_lock(&kprobe_mutex); + /* Unconditionally unregister the page_fault notifier */ + unregister_page_fault_notifier(&kprobe_page_fault_nb); + +already_disabled: + mutex_unlock(&kprobe_mutex); + return; +} + +/* + * 
XXX: The debugfs bool file interface doesn't allow for callbacks + * when the bool state is switched. We can reuse that facility when + * available + */ +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (kprobe_enabled) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_all_kprobes(); + break; + case 'n': + case 'N': + case '0': + disable_all_kprobes(); + break; + } + + return count; +} + +static struct file_operations fops_kp = { + .read = read_enabled_file_bool, + .write = write_enabled_file_bool, +}; + static int __kprobes debugfs_kprobe_init(void) { struct dentry *dir, *file; + unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); if (!dir) return -ENOMEM; - file = debugfs_create_file("list", 0444, dir , 0 , + file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_kprobes_operations); if (!file) { debugfs_remove(dir); return -ENOMEM; } + file = debugfs_create_file("enabled", 0600, dir, + &value, &fops_kp); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + return 0; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e0ffe4ab091..559deca5ed1 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -24,18 +24,18 @@ static struct subsys_attribute _name##_attr = \ #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) /* current uevent sequence number */ -static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) +static ssize_t uevent_seqnum_show(struct kset *kset, char *page) { return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); } KERNEL_ATTR_RO(uevent_seqnum); /* uevent helper program, used during early boo */ -static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) +static ssize_t uevent_helper_show(struct kset *kset, char *page) { return sprintf(page, "%s\n", uevent_helper); } -static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) +static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) { if (count+1 > UEVENT_HELPER_PATH_LEN) return -ENOENT; @@ -49,13 +49,13 @@ KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC -static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +static ssize_t kexec_loaded_show(struct kset *kset, char *page) { return sprintf(page, "%d\n", !!kexec_image); } KERNEL_ATTR_RO(kexec_loaded); -static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) { return sprintf(page, "%d\n", !!kexec_crash_image); } @@ -85,7 +85,7 @@ static int __init ksysfs_init(void) { int error = subsystem_register(&kernel_subsys); if (!error) - error = sysfs_create_group(&kernel_subsys.kset.kobj, + error = sysfs_create_group(&kernel_subsys.kobj, &kernel_attr_group); return error; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 7065a687ac5..1a5ff2211d8 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace) trace->entries = stack_trace + nr_stack_trace_entries; trace->skip = 3; 
- trace->all_contexts = 0; - save_stack_trace(trace, NULL); + save_stack_trace(trace); trace->max_entries = trace->nr_entries; @@ -341,10 +340,7 @@ static const char *usage_str[] = const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { - unsigned long offs, size; - char *modname; - - return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str); + return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); } void @@ -1313,8 +1309,9 @@ out_unlock_set: /* * Look up a dependency chain. If the key is not present yet then - * add it and return 0 - in this case the new dependency chain is - * validated. If the key is already hashed, return 1. + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) */ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) { @@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, * Mark a lock with a usage bit, and validate the state transition: */ static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, unsigned long ip) + enum lock_usage_bit new_bit) { unsigned int new_mask = 1 << new_bit, ret = 1; @@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, this->class->usage_mask |= new_mask; -#ifdef CONFIG_TRACE_IRQFLAGS - if (new_bit == LOCK_ENABLED_HARDIRQS || - new_bit == LOCK_ENABLED_HARDIRQS_READ) - ip = curr->hardirq_enable_ip; - else if (new_bit == LOCK_ENABLED_SOFTIRQS || - new_bit == LOCK_ENABLED_SOFTIRQS_READ) - ip = curr->softirq_enable_ip; -#endif if (!save_trace(this->class->usage_traces + new_bit)) return 0; @@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) +mark_held_locks(struct task_struct *curr, int hardirq) { enum lock_usage_bit usage_bit; struct held_lock *hlock; @@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip) else usage_bit = LOCK_ENABLED_SOFTIRQS; } - if (!mark_lock(curr, hlock, usage_bit, ip)) + if (!mark_lock(curr, hlock, usage_bit)) return 0; } @@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, 1, ip)) + if (!mark_held_locks(curr, 1)) return; /* * If we have softirqs enabled, then set the usage @@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, 0, ip)) + if (!mark_held_locks(curr, 0)) return; curr->hardirq_enable_ip = ip; @@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, 0, ip); + mark_held_locks(curr, 0); } /* @@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (read) { if (curr->hardirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ, ip)) + LOCK_USED_IN_HARDIRQ_READ)) return 0; if (curr->softirq_context) if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ, ip)) + LOCK_USED_IN_SOFTIRQ_READ)) return 0; } else { if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) return 0; if (curr->softirq_context) - if 
(!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip)) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) return 0; } } if (!hardirqs_off) { if (read) { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS_READ, ip)) + LOCK_ENABLED_HARDIRQS_READ)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS_READ, ip)) + LOCK_ENABLED_SOFTIRQS_READ)) return 0; } else { if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQS, ip)) + LOCK_ENABLED_HARDIRQS)) return 0; if (curr->softirqs_enabled) if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQS, ip)) + LOCK_ENABLED_SOFTIRQS)) return 0; } } #endif /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED, ip)) + if (!mark_lock(curr, hlock, LOCK_USED)) return 0; out_calc_hash: /* diff --git a/kernel/module.c b/kernel/module.c index 9da5af668a2..d36e45477fa 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/init.h> +#include <linux/kallsyms.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -45,6 +46,8 @@ #include <asm/cacheflush.h> #include <linux/license.h> +extern int module_sysfs_initialized; + #if 0 #define DEBUGP printk #else @@ -308,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size) { /* Reallocation required? */ if (pcpu_num_used + 1 > pcpu_num_allocated) { - int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2, - GFP_KERNEL); + int *new; + + new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2, + GFP_KERNEL); if (!new) return 0; - memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated); pcpu_num_allocated *= 2; - kfree(pcpu_size); pcpu_size = new; } @@ -346,10 +349,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, unsigned int i; void *ptr; - if (align > SMP_CACHE_BYTES) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", - name, align, SMP_CACHE_BYTES); - align = SMP_CACHE_BYTES; + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; } ptr = __per_cpu_start; @@ -430,7 +433,7 @@ static int percpu_modinit(void) pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated, GFP_KERNEL); /* Static in-kernel percpu data (used). */ - pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES); + pcpu_size[0] = -(__per_cpu_end-__per_cpu_start); /* Free room. 
*/ pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0]; if (pcpu_size[1] < 0) { @@ -1117,8 +1120,8 @@ int mod_sysfs_init(struct module *mod) { int err; - if (!module_subsys.kset.subsys) { - printk(KERN_ERR "%s: module_subsys not initialized\n", + if (!module_sysfs_initialized) { + printk(KERN_ERR "%s: module sysfs not initialized\n", mod->name); err = -EINVAL; goto out; @@ -1469,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS -int is_exported(const char *name, const struct module *mod) +static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; @@ -2095,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod, if (!best) return NULL; - *size = nextval - mod->symtab[best].st_value; - *offset = addr - mod->symtab[best].st_value; + if (size) + *size = nextval - mod->symtab[best].st_value; + if (offset) + *offset = addr - mod->symtab[best].st_value; return mod->strtab + mod->symtab[best].st_name; } @@ -2121,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, size_t namelen) +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + strlcpy(symname, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) { + if (within(addr, mod->module_init, mod->init_size) || + within(addr, mod->module_core, mod->core_size)) { + const char *sym; + + sym = get_ksymbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strlcpy(modname, mod->name, MODULE_NAME_LEN + 1); + if (name) + strlcpy(name, sym, KSYM_NAME_LEN + 1); + mutex_unlock(&module_mutex); + return 0; + } + } +out: + mutex_unlock(&module_mutex); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) { struct module *mod; @@ -2132,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - namelen); + KSYM_NAME_LEN + 1); + strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1); + *exported = is_exported(name, mod); mutex_unlock(&module_mutex); - return mod; + return 0; } symnum -= mod->num_symtab; } mutex_unlock(&module_mutex); - return NULL; + return -ERANGE; } static unsigned long mod_find_symname(struct module *mod, const char *name) @@ -2385,7 +2442,7 @@ void module_add_driver(struct module *mod, struct device_driver *drv) struct kobject *mkobj; /* Lookup built-in module entry in /sys/modules */ - mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name); + mkobj = kset_find_obj(&module_subsys, drv->mod_name); if (mkobj) { mk = container_of(mkobj, struct module_kobject, kobj); /* remember our module 
structure */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5b9ee6f6bb..1bc4b55241a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk) /* * creates a copy of "orig" with refcount 1. - * This does not grab references to the contained namespaces, - * so that needs to be done by dup_namespaces. */ -static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig) { struct nsproxy *ns; @@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) } /* - * copies the nsproxy, setting refcount to 1, and grabbing a - * reference to all contained namespaces. Called from - * sys_unshare() + * Create new nsproxy and all of its the associated namespaces. + * Return the newly created nsproxy. Do not attach this to the task, + * leave it to the caller to do proper locking and attach it to task. */ -struct nsproxy *dup_namespaces(struct nsproxy *orig) +static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk, + struct fs_struct *new_fs) { - struct nsproxy *ns = clone_namespaces(orig); + struct nsproxy *new_nsp; - if (ns) { - if (ns->mnt_ns) - get_mnt_ns(ns->mnt_ns); - if (ns->uts_ns) - get_uts_ns(ns->uts_ns); - if (ns->ipc_ns) - get_ipc_ns(ns->ipc_ns); - if (ns->pid_ns) - get_pid_ns(ns->pid_ns); - } + new_nsp = clone_nsproxy(tsk->nsproxy); + if (!new_nsp) + return ERR_PTR(-ENOMEM); - return ns; + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + if (IS_ERR(new_nsp->mnt_ns)) + goto out_ns; + + new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + if (IS_ERR(new_nsp->uts_ns)) + goto out_uts; + + new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + if (IS_ERR(new_nsp->ipc_ns)) + goto out_ipc; + + new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns); + if (IS_ERR(new_nsp->pid_ns)) + goto out_pid; + + return new_nsp; + +out_pid: + if (new_nsp->ipc_ns) + put_ipc_ns(new_nsp->ipc_ns); +out_ipc: + if (new_nsp->uts_ns) + put_uts_ns(new_nsp->uts_ns); +out_uts: + if (new_nsp->mnt_ns) + put_mnt_ns(new_nsp->mnt_ns); +out_ns: + kfree(new_nsp); + return ERR_PTR(-ENOMEM); } /* @@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk) if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) return 0; - new_ns = clone_namespaces(old_ns); - if (!new_ns) { - err = -ENOMEM; + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; goto out; } - tsk->nsproxy = new_ns; - - err = copy_mnt_ns(flags, tsk); - if (err) - goto out_ns; - - err = copy_utsname(flags, tsk); - if (err) - goto out_uts; - - err = copy_ipcs(flags, tsk); - if (err) - goto out_ipc; - - err = copy_pid_ns(flags, tsk); - if (err) - goto out_pid; + new_ns = create_new_namespaces(flags, tsk, tsk->fs); + if (IS_ERR(new_ns)) { + err = PTR_ERR(new_ns); + goto out; + } + tsk->nsproxy = new_ns; out: put_nsproxy(old_ns); return err; - -out_pid: - if (new_ns->ipc_ns) - put_ipc_ns(new_ns->ipc_ns); -out_ipc: - if (new_ns->uts_ns) - put_uts_ns(new_ns->uts_ns); -out_uts: - if (new_ns->mnt_ns) - put_mnt_ns(new_ns->mnt_ns); -out_ns: - tsk->nsproxy = old_ns; - kfree(new_ns); - goto out; } void free_nsproxy(struct nsproxy *ns) @@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns) put_pid_ns(ns->pid_ns); kfree(ns); } + +/* + * Called from unshare. Unshare all the namespaces part of nsproxy. + * On sucess, returns the new nsproxy and a reference to old nsproxy + * to make sure it stays around. 
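The new create_new_namespaces() relies on the kernel's ERR_PTR() encoding: a constructor returns either a valid pointer or an errno folded into the pointer value, and callers such as copy_namespaces() and unshare_nsproxy_namespaces() decode it with IS_ERR()/PTR_ERR(). A minimal userspace sketch of that convention, assuming the usual 4095-errno reserved range; struct widget and make_widget() are invented for illustration:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Userspace re-creation of the ERR_PTR()/IS_ERR()/PTR_ERR() idiom. */
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-4095;
    }

    struct widget { int id; };

    /* Constructor shaped like create_new_namespaces(): pointer or encoded errno. */
    static struct widget *make_widget(int id)
    {
        struct widget *w;

        if (id < 0)
            return ERR_PTR(-EINVAL);
        w = malloc(sizeof(*w));
        if (!w)
            return ERR_PTR(-ENOMEM);
        w->id = id;
        return w;
    }

    int main(void)
    {
        struct widget *w = make_widget(-1);

        if (IS_ERR(w))
            printf("constructor failed: %ld\n", PTR_ERR(w));  /* -22 (EINVAL) */
        else
            free(w);
        return 0;
    }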
+ */ +int unshare_nsproxy_namespaces(unsigned long unshare_flags, + struct nsproxy **new_nsp, struct fs_struct *new_fs) +{ + struct nsproxy *old_ns = current->nsproxy; + int err = 0; + + if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + +#ifndef CONFIG_IPC_NS + if (unshare_flags & CLONE_NEWIPC) + return -EINVAL; +#endif + +#ifndef CONFIG_UTS_NS + if (unshare_flags & CLONE_NEWUTS) + return -EINVAL; +#endif + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + get_nsproxy(old_ns); + + *new_nsp = create_new_namespaces(unshare_flags, current, + new_fs ? new_fs : current->fs); + if (IS_ERR(*new_nsp)) { + err = PTR_ERR(*new_nsp); + put_nsproxy(old_ns); + } + return err; +} diff --git a/kernel/params.c b/kernel/params.c index 1fc4ac746cd..e61c46c97ce 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp) return param_get_bool(buffer, &dummy); } -/* We cheat here and temporarily mangle the string. */ +/* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, unsigned int min, unsigned int max, @@ -691,6 +691,7 @@ static struct kset_uevent_ops module_uevent_ops = { }; decl_subsys(module, &module_ktype, &module_uevent_ops); +int module_sysfs_initialized; static struct kobj_type module_ktype = { .sysfs_ops = &module_sysfs_ops, @@ -709,6 +710,7 @@ static int __init param_sysfs_init(void) __FILE__, __LINE__, ret); return ret; } + module_sysfs_initialized = 1; param_sysfs_builtin(); diff --git a/kernel/pid.c b/kernel/pid.c index 78f2aee90f5..d3ad724afa8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr) } EXPORT_SYMBOL_GPL(find_get_pid); -int copy_pid_ns(int flags, struct task_struct *tsk) +struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns) { - struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; - int err = 0; - - if (!old_ns) - return 0; - + BUG_ON(!old_ns); get_pid_ns(old_ns); - return err; + return old_ns; } void free_pid_ns(struct kref *kref) @@ -412,7 +407,5 @@ void __init pidmap_init(void) set_bit(0, init_pid_ns.pidmap[0].page); atomic_dec(&init_pid_ns.pidmap[0].nr_free); - pid_cachep = kmem_cache_create("pid", sizeof(struct pid), - __alignof__(struct pid), - SLAB_PANIC, NULL, NULL); + pid_cachep = KMEM_CACHE(pid, SLAB_PANIC); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 657f7769741..1de710e1837 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { @@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { @@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk, maxfire = 20; tsk->it_sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct 
cpu_timer_list, entry); if (!--maxfire || tsk->sched_time < t->expires.sched) { @@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { @@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { @@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_entry(timers->next, + struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || sched_time < t->expires.sched) { @@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, */ head = &tsk->signal->cpu_timers[clock_idx]; if (list_empty(head) || - cputime_ge(list_entry(head->next, + cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44318ca7197..588c99da030 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -31,7 +31,6 @@ * POSIX clocks & timers */ #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/time.h> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 51a4dd0f1b7..495b7d4dd33 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -78,17 +78,22 @@ config PM_SYSFS_DEPRECATED are likely to be bus or driver specific. config SOFTWARE_SUSPEND - bool "Software Suspend" - depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) + bool "Software Suspend (Hibernation)" + depends on PM && SWAP && (((X86 || PPC64_SWSUSP) && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) ---help--- - Enable the suspend to disk (STD) functionality. + Enable the suspend to disk (STD) functionality, which is usually + called "hibernation" in user interfaces. STD checkpoints the + system and powers it off; and restores that checkpoint on reboot. You can suspend your machine with 'echo disk > /sys/power/state'. Alternatively, you can use the additional userland tools available from <http://suspend.sf.net>. In principle it does not require ACPI or APM, although for example - ACPI will be used if available. + ACPI will be used for the final steps when it is available. One + of the reasons to use software suspend is that the firmware hooks + for suspend states like suspend-to-RAM (STR) often don't work very + well with Linux. It creates an image which is saved in your active swap. 
Upon the next boot, pass the 'resume=/dev/swappartition' argument to the kernel to @@ -134,7 +139,7 @@ config PM_STD_PARTITION config SUSPEND_SMP bool - depends on HOTPLUG_CPU && X86 && PM + depends on HOTPLUG_CPU && (X86 || PPC64) && PM default y config APM_EMULATION diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 02e4fb69111..06331374d86 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -130,15 +130,25 @@ int pm_suspend_disk(void) { int error; + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) + return -EBUSY; + + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Exit; + error = prepare_processes(); if (error) - return error; + goto Finish; if (pm_disk_mode == PM_DISK_TESTPROC) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); goto Thaw; } + /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); if (error) @@ -196,6 +206,10 @@ int pm_suspend_disk(void) resume_console(); Thaw: unprepare_processes(); + Finish: + free_basic_memory_bitmaps(); + Exit: + atomic_inc(&snapshot_device_available); return error; } @@ -239,13 +253,21 @@ static int software_resume(void) } pr_debug("PM: Checking swsusp image.\n"); - error = swsusp_check(); if (error) - goto Done; + goto Unlock; - pr_debug("PM: Preparing processes for restore.\n"); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + error = create_basic_memory_bitmaps(); + if (error) + goto Finish; + + pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); if (error) { swsusp_close(); @@ -280,7 +302,11 @@ static int software_resume(void) printk(KERN_ERR "PM: Restore failed, recovering.\n"); unprepare_processes(); Done: + free_basic_memory_bitmaps(); + Finish: + atomic_inc(&snapshot_device_available); /* For success case, the suspend path will release the lock */ + Unlock: mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return 0; @@ -322,13 +348,40 @@ static const char * const pm_disk_modes[] = { * supports it (as determined from pm_ops->pm_disk_mode). 
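Both pm_suspend_disk() and software_resume() now claim the snapshot interface with atomic_add_unless(&snapshot_device_available, -1, 0) and release it with atomic_inc() on every exit label. A rough userspace model of that acquire/release pattern; C11 atomics and the hand-rolled add_unless() helper stand in for the kernel's atomic_t API:

    #include <stdatomic.h>
    #include <stdio.h>

    /* One free "slot": 1 means the snapshot interface is available, 0 means busy. */
    static atomic_int snapshot_available = 1;

    /* Decrement v unless it already equals 'unless'; mirrors atomic_add_unless(v, -1, 0). */
    static int add_unless(atomic_int *v, int delta, int unless)
    {
        int old = atomic_load(v);

        while (old != unless) {
            if (atomic_compare_exchange_weak(v, &old, old + delta))
                return 1;   /* we performed the update */
        }
        return 0;           /* counter already held 'unless' */
    }

    int main(void)
    {
        if (!add_unless(&snapshot_available, -1, 0)) {
            puts("busy");                            /* someone else holds the device */
            return 1;
        }
        puts("acquired");
        atomic_fetch_add(&snapshot_available, 1);    /* the Exit-label release */
        return 0;
    }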
*/ -static ssize_t disk_show(struct subsystem * subsys, char * buf) +static ssize_t disk_show(struct kset *kset, char *buf) { - return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]); + int i; + char *start = buf; + + for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { + if (!pm_disk_modes[i]) + continue; + switch (i) { + case PM_DISK_SHUTDOWN: + case PM_DISK_REBOOT: + case PM_DISK_TEST: + case PM_DISK_TESTPROC: + break; + default: + if (pm_ops && pm_ops->enter && + (i == pm_ops->pm_disk_mode)) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == pm_disk_mode) + buf += sprintf(buf, "[%s]", pm_disk_modes[i]); + else + buf += sprintf(buf, "%s", pm_disk_modes[i]); + if (i+1 != PM_DISK_MAX) + buf += sprintf(buf, " "); + } + buf += sprintf(buf, "\n"); + return buf-start; } -static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) +static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) { int error = 0; int i; @@ -373,13 +426,13 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) power_attr(disk); -static ssize_t resume_show(struct subsystem * subsys, char *buf) +static ssize_t resume_show(struct kset *kset, char *buf) { return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) +static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) { unsigned int maj, min; dev_t res; @@ -405,12 +458,12 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) power_attr(resume); -static ssize_t image_size_show(struct subsystem * subsys, char *buf) +static ssize_t image_size_show(struct kset *kset, char *buf) { return sprintf(buf, "%lu\n", image_size); } -static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) { unsigned long size; @@ -439,7 +492,7 @@ static struct attribute_group attr_group = { static int __init pm_disk_init(void) { - return sysfs_create_group(&power_subsys.kset.kobj,&attr_group); + return sysfs_create_group(&power_subsys.kobj, &attr_group); } core_initcall(pm_disk_init); diff --git a/kernel/power/main.c b/kernel/power/main.c index 72419a3b1be..f6dda685e7e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -184,17 +184,21 @@ static void suspend_finish(suspend_state_t state) static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", -#ifdef CONFIG_SOFTWARE_SUSPEND [PM_SUSPEND_DISK] = "disk", -#endif }; static inline int valid_state(suspend_state_t state) { /* Suspend-to-disk does not really need low-level support. - * It can work with reboot if needed. */ + * It can work with shutdown/reboot if needed. If it isn't + * configured, then it cannot be supported. + */ if (state == PM_SUSPEND_DISK) +#ifdef CONFIG_SOFTWARE_SUSPEND return 1; +#else + return 0; +#endif /* all other states need lowlevel support and need to be * valid to the lowlevel implementation, no valid callback @@ -244,15 +248,6 @@ static int enter_state(suspend_state_t state) return error; } -/* - * This is main interface to the outside world. It needs to be - * called from process context. - */ -int software_suspend(void) -{ - return enter_state(PM_SUSPEND_DISK); -} - /** * pm_suspend - Externally visible function for suspending system. 
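The rewritten disk_show() follows the common sysfs habit of listing every valid mode and bracketing the currently selected one. A small sketch of just that formatting step; the mode table, buffer size and 'active' index below are made up:

    #include <stdio.h>

    static const char *const modes[] = {
        "platform", "shutdown", "reboot", "test", "testproc"
    };

    /* Emit "mode [mode] mode ...\n" into buf, bracketing the active entry. */
    static int show_modes(char *buf, size_t len, int active)
    {
        char *p = buf;

        for (int i = 0; i < (int)(sizeof(modes) / sizeof(modes[0])); i++) {
            size_t left = len - (size_t)(p - buf);

            if (i == active)
                p += snprintf(p, left, "[%s] ", modes[i]);
            else
                p += snprintf(p, left, "%s ", modes[i]);
        }
        if (p > buf)
            p[-1] = '\n';            /* turn the trailing space into a newline */
        return (int)(p - buf);
    }

    int main(void)
    {
        char buf[128];
        int n = show_modes(buf, sizeof(buf), 1);

        fwrite(buf, 1, (size_t)n, stdout);   /* platform [shutdown] reboot test testproc */
        return 0;
    }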
@@ -285,7 +280,7 @@ decl_subsys(power,NULL,NULL); * proper enumerated value, and initiates a suspend transition. */ -static ssize_t state_show(struct subsystem * subsys, char * buf) +static ssize_t state_show(struct kset *kset, char *buf) { int i; char * s = buf; @@ -298,7 +293,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) return (s - buf); } -static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t state_store(struct kset *kset, const char *buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; const char * const *s; @@ -325,13 +320,13 @@ power_attr(state); #ifdef CONFIG_PM_TRACE int pm_trace_enabled; -static ssize_t pm_trace_show(struct subsystem * subsys, char * buf) +static ssize_t pm_trace_show(struct kset *kset, char *buf) { return sprintf(buf, "%d\n", pm_trace_enabled); } static ssize_t -pm_trace_store(struct subsystem * subsys, const char * buf, size_t n) +pm_trace_store(struct kset *kset, const char *buf, size_t n) { int val; @@ -365,7 +360,7 @@ static int __init pm_init(void) { int error = subsystem_register(&power_subsys); if (!error) - error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group); + error = sysfs_create_group(&power_subsys.kobj,&attr_group); return error; } diff --git a/kernel/power/power.h b/kernel/power/power.h index eb461b816bf..34b43542785 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -14,8 +14,18 @@ struct swsusp_info { #ifdef CONFIG_SOFTWARE_SUSPEND -extern int pm_suspend_disk(void); +/* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] + */ +#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) +/* + * Keep 1 MB of memory free so that device drivers can allocate some pages in + * their .suspend() routines without breaking the suspend to disk. + */ +#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +extern int pm_suspend_disk(void); #else static inline int pm_suspend_disk(void) { @@ -23,6 +33,8 @@ static inline int pm_suspend_disk(void) } #endif +extern int pfn_is_nosave(unsigned long); + extern struct mutex pm_mutex; #define power_attr(_name) \ @@ -35,10 +47,7 @@ static struct subsys_attribute _name##_attr = { \ .store = _name##_store, \ } -extern struct subsystem power_subsys; - -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +extern struct kset power_subsys; /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; @@ -49,6 +58,8 @@ extern sector_t swsusp_resume_block; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern int create_basic_memory_bitmaps(void); +extern void free_basic_memory_bitmaps(void); extern unsigned int count_data_pages(void); /** @@ -139,30 +150,12 @@ struct resume_swap_area { #define PMOPS_ENTER 2 #define PMOPS_FINISH 3 -/** - * The bitmap is used for tracing allocated swap pages - * - * The entire bitmap consists of a number of bitmap_page - * structures linked with the help of the .next member. - * Thus each page can be allocated individually, so we only - * need to make 0-order memory allocations to create - * the bitmap. 
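The two reserves added to power.h are plain page-count arithmetic. Assuming 4 KiB pages (PAGE_SHIFT == 12, an assumption rather than something the header fixes), they come out to 1024 and 256 page frames:

    #include <stdio.h>

    #define PAGE_SHIFT    12                             /* assumed 4 KiB pages */
    #define PAGES_FOR_IO  ((4096 * 1024) >> PAGE_SHIFT)  /* 4 MiB of I/O head room */
    #define SPARE_PAGES   ((1024 * 1024) >> PAGE_SHIFT)  /* 1 MiB for .suspend() allocations */

    int main(void)
    {
        printf("PAGES_FOR_IO = %d pages\n", PAGES_FOR_IO);  /* 1024 */
        printf("SPARE_PAGES  = %d pages\n", SPARE_PAGES);   /* 256  */
        return 0;
    }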
- */ - -#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *)) -#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long)) -#define BITS_PER_CHUNK (sizeof(long) * 8) -#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK) - -struct bitmap_page { - unsigned long chunks[BITMAP_PAGE_CHUNKS]; - struct bitmap_page *next; -}; +/* If unset, the snapshot device cannot be open. */ +extern atomic_t snapshot_device_available; -extern void free_bitmap(struct bitmap_page *bitmap); -extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); -extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); -extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +extern sector_t alloc_swapdev_block(int swap); +extern void free_all_swap_pages(int swap); +extern int swsusp_swap_in_use(void); extern int swsusp_check(void); extern int swsusp_shrink_memory(void); diff --git a/kernel/power/process.c b/kernel/power/process.c index 6d566bf7085..08841938738 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -8,7 +8,6 @@ #undef DEBUG -#include <linux/smp_lock.h> #include <linux/interrupt.h> #include <linux/suspend.h> #include <linux/module.h> @@ -25,10 +24,9 @@ static inline int freezeable(struct task_struct * p) { - if ((p == current) || + if ((p == current) || (p->flags & PF_NOFREEZE) || - (p->exit_state == EXIT_ZOMBIE) || - (p->exit_state == EXIT_DEAD)) + (p->exit_state != 0)) return 0; return 1; } @@ -47,8 +45,10 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - while (frozen(current)) { - current->state = TASK_UNINTERRUPTIBLE; + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; schedule(); } pr_debug("%s left refrigerator\n", current->comm); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fc53ad06812..b7039772b05 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -14,13 +14,13 @@ #include <linux/module.h> #include <linux/mm.h> #include <linux/suspend.h> -#include <linux/smp_lock.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/pm.h> #include <linux/device.h> +#include <linux/init.h> #include <linux/bootmem.h> #include <linux/syscalls.h> #include <linux/console.h> @@ -34,6 +34,10 @@ #include "power.h" +static int swsusp_page_is_free(struct page *); +static void swsusp_set_page_forbidden(struct page *); +static void swsusp_unset_page_forbidden(struct page *); + /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been * allocated by the "resume" kernel, so their contents cannot be written @@ -67,15 +71,15 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - while (res && PageNosaveFree(virt_to_page(res))) { + while (res && swsusp_page_is_free(virt_to_page(res))) { /* The page is unsafe, mark it for swsusp_free() */ - SetPageNosave(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); allocated_unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); } if (res) { - SetPageNosave(virt_to_page(res)); - SetPageNosaveFree(virt_to_page(res)); + swsusp_set_page_forbidden(virt_to_page(res)); + swsusp_set_page_free(virt_to_page(res)); } return res; } @@ -91,8 +95,8 @@ static struct page *alloc_image_page(gfp_t gfp_mask) page = alloc_page(gfp_mask); if (page) { - 
SetPageNosave(page); - SetPageNosaveFree(page); + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); } return page; } @@ -110,9 +114,9 @@ static inline void free_image_page(void *addr, int clear_nosave_free) page = virt_to_page(addr); - ClearPageNosave(page); + swsusp_unset_page_forbidden(page); if (clear_nosave_free) - ClearPageNosaveFree(page); + swsusp_unset_page_free(page); __free_page(page); } @@ -224,11 +228,6 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) * of type unsigned long each). It also contains the pfns that * correspond to the start and end of the represented memory area and * the number of bit chunks in the block. - * - * NOTE: Memory bitmaps are used for two types of operations only: - * "set a bit" and "find the next bit set". Moreover, the searching - * is always carried out after all of the "set a bit" operations - * on given bitmap. */ #define BM_END_OF_MAP (~0UL) @@ -443,15 +442,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) } /** - * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds + * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. - * - * If the bit cannot be set, the function returns -EINVAL . */ -static int -memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) { struct zone_bitmap *zone_bm; struct bm_block *bb; @@ -463,8 +460,8 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) /* We don't assume that the zones are sorted by pfns */ while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { zone_bm = zone_bm->next; - if (unlikely(!zone_bm)) - return -EINVAL; + + BUG_ON(!zone_bm); } bm->cur.zone_bm = zone_bm; } @@ -475,13 +472,40 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) while (pfn >= bb->end_pfn) { bb = bb->next; - if (unlikely(!bb)) - return -EINVAL; + + BUG_ON(!bb); } zone_bm->cur_block = bb; pfn -= bb->start_pfn; - set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK); - return 0; + *bit_nr = pfn % BM_BITS_PER_CHUNK; + *addr = bb->data + pfn / BM_BITS_PER_CHUNK; +} + +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + set_bit(bit, addr); +} + +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + clear_bit(bit, addr); +} + +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + memory_bm_find_bit(bm, pfn, &addr, &bit); + return test_bit(bit, addr); } /* Two auxiliary functions for memory_bm_next_pfn */ @@ -564,6 +588,199 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) } /** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. 
+ */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); + +/** + * register_nosave_region - register a range of page frames the contents + * of which should not be saved during the suspend (to be used in the early + * initialization code) + */ + +void __init +register_nosave_region(unsigned long start_pfn, unsigned long end_pfn) +{ + struct nosave_region *region; + + if (start_pfn >= end_pfn) + return; + + if (!list_empty(&nosave_regions)) { + /* Try to extend the previous region (they should be sorted) */ + region = list_entry(nosave_regions.prev, + struct nosave_region, list); + if (region->end_pfn == start_pfn) { + region->end_pfn = end_pfn; + goto Report; + } + } + /* This allocation cannot fail */ + region = alloc_bootmem_low(sizeof(struct nosave_region)); + region->start_pfn = start_pfn; + region->end_pfn = end_pfn; + list_add_tail(®ion->list, &nosave_regions); + Report: + printk("swsusp: Registered nosave memory region: %016lx - %016lx\n", + start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); +} + +/* + * Set bits in this map correspond to the page frames the contents of which + * should not be saved during the suspend. + */ +static struct memory_bitmap *forbidden_pages_map; + +/* Set bits in this map correspond to free page frames. */ +static struct memory_bitmap *free_pages_map; + +/* + * Each page frame allocated for creating the image is marked by setting the + * corresponding bits in forbidden_pages_map and free_pages_map simultaneously + */ + +void swsusp_set_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_set_bit(free_pages_map, page_to_pfn(page)); +} + +static int swsusp_page_is_free(struct page *page) +{ + return free_pages_map ? + memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; +} + +void swsusp_unset_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); +} + +static void swsusp_set_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); +} + +int swsusp_page_is_forbidden(struct page *page) +{ + return forbidden_pages_map ? + memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; +} + +static void swsusp_unset_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); +} + +/** + * mark_nosave_pages - set bits corresponding to the page frames the + * contents of which should not be saved in a given bitmap. + */ + +static void mark_nosave_pages(struct memory_bitmap *bm) +{ + struct nosave_region *region; + + if (list_empty(&nosave_regions)) + return; + + list_for_each_entry(region, &nosave_regions, list) { + unsigned long pfn; + + printk("swsusp: Marking nosave pages: %016lx - %016lx\n", + region->start_pfn << PAGE_SHIFT, + region->end_pfn << PAGE_SHIFT); + + for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) + memory_bm_set_bit(bm, pfn); + } +} + +/** + * create_basic_memory_bitmaps - create bitmaps needed for marking page + * frames that should not be saved and free page frames. The pointers + * forbidden_pages_map and free_pages_map are only modified if everything + * goes well, because we don't want the bits to be used before both bitmaps + * are set up. 
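The invariant behind these helpers is that a page frame allocated for the image is flagged in both forbidden_pages_map and free_pages_map at once, which is exactly the combination swsusp_free() and get_buffer() test for. A toy model of the two-bitmap convention, with plain bool arrays standing in for the real memory bitmaps:

    #include <stdbool.h>
    #include <stdio.h>

    #define NPAGES 16
    static bool forbidden[NPAGES];   /* forbidden_pages_map stand-in */
    static bool freemap[NPAGES];     /* free_pages_map stand-in      */

    /* Pages grabbed for the image are marked in both maps at once. */
    static void mark_image_page(unsigned int pfn)
    {
        forbidden[pfn] = true;
        freemap[pfn] = true;
    }

    int main(void)
    {
        mark_image_page(3);
        forbidden[7] = true;         /* e.g. a nosave region: forbidden only */

        for (unsigned int pfn = 0; pfn < NPAGES; pfn++)
            if (forbidden[pfn] && freemap[pfn])
                printf("pfn %u would be released by swsusp_free()\n", pfn);
        return 0;
    }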
+ */ + +int create_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + int error = 0; + + BUG_ON(forbidden_pages_map || free_pages_map); + + bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm1) + return -ENOMEM; + + error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); + if (error) + goto Free_first_object; + + bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm2) + goto Free_first_bitmap; + + error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY); + if (error) + goto Free_second_object; + + forbidden_pages_map = bm1; + free_pages_map = bm2; + mark_nosave_pages(forbidden_pages_map); + + printk("swsusp: Basic memory bitmaps created\n"); + + return 0; + + Free_second_object: + kfree(bm2); + Free_first_bitmap: + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + Free_first_object: + kfree(bm1); + return -ENOMEM; +} + +/** + * free_basic_memory_bitmaps - free memory bitmaps allocated by + * create_basic_memory_bitmaps(). The auxiliary pointers are necessary + * so that the bitmaps themselves are not referred to while they are being + * freed. + */ + +void free_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + + BUG_ON(!(forbidden_pages_map && free_pages_map)); + + bm1 = forbidden_pages_map; + bm2 = free_pages_map; + forbidden_pages_map = NULL; + free_pages_map = NULL; + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + kfree(bm1); + memory_bm_free(bm2, PG_UNSAFE_CLEAR); + kfree(bm2); + + printk("swsusp: Basic memory bitmaps freed\n"); +} + +/** * snapshot_additional_pages - estimate the number of additional pages * be needed for setting up the suspend image data structures for given * zone (usually the returned value is greater than the exact number) @@ -615,7 +832,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) BUG_ON(!PageHighMem(page)); - if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || + PageReserved(page)) return NULL; return page; @@ -651,17 +869,6 @@ static inline unsigned int count_highmem_pages(void) { return 0; } #endif /* CONFIG_HIGHMEM */ /** - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -static inline int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - -/** * saveable - Determine whether a non-highmem page should be included in * the suspend image. 
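Note how create_basic_memory_bitmaps() assigns forbidden_pages_map and free_pages_map only after every allocation has succeeded, so nothing ever sees a half-built pair. The same build-then-publish shape in a self-contained sketch, with calloc()/free() standing in for memory_bm_create()/memory_bm_free():

    #include <stdio.h>
    #include <stdlib.h>

    static int *map_a, *map_b;          /* stand-ins for the two global bitmap pointers */

    static int create_maps(void)
    {
        int *a, *b;

        a = calloc(128, sizeof(*a));
        if (!a)
            goto fail;
        b = calloc(128, sizeof(*b));
        if (!b)
            goto free_a;

        map_a = a;                      /* publish only once both exist */
        map_b = b;
        return 0;

    free_a:
        free(a);
    fail:
        return -1;
    }

    int main(void)
    {
        if (create_maps())
            return 1;
        puts("bitmaps created");
        free(map_a);
        free(map_b);
        return 0;
    }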
* @@ -681,7 +888,7 @@ static struct page *saveable_page(unsigned long pfn) BUG_ON(PageHighMem(page)); - if (PageNosave(page) || PageNosaveFree(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; if (PageReserved(page) && pfn_is_nosave(pfn)) @@ -821,9 +1028,10 @@ void swsusp_free(void) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); - if (PageNosave(page) && PageNosaveFree(page)) { - ClearPageNosave(page); - ClearPageNosaveFree(page); + if (swsusp_page_is_forbidden(page) && + swsusp_page_is_free(page)) { + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); __free_page(page); } } @@ -1146,7 +1354,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) - ClearPageNosaveFree(pfn_to_page(pfn)); + swsusp_unset_page_free(pfn_to_page(pfn)); } /* Mark pages that correspond to the "original" pfns as "unsafe" */ @@ -1155,7 +1363,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { if (likely(pfn_valid(pfn))) - SetPageNosaveFree(pfn_to_page(pfn)); + swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; } @@ -1321,14 +1529,14 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) struct page *page; page = alloc_page(__GFP_HIGHMEM); - if (!PageNosaveFree(page)) { + if (!swsusp_page_is_free(page)) { /* The page is "safe", set its bit the bitmap */ memory_bm_set_bit(bm, page_to_pfn(page)); safe_highmem_pages++; } /* Mark the page as allocated */ - SetPageNosave(page); - SetPageNosaveFree(page); + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); } memory_bm_position_reset(bm); safe_highmem_bm = bm; @@ -1360,7 +1568,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) struct highmem_pbe *pbe; void *kaddr; - if (PageNosave(page) && PageNosaveFree(page)) { + if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) { /* We have allocated the "original" page frame and we can * use it directly to store the loaded page. */ @@ -1522,14 +1730,14 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) error = -ENOMEM; goto Free; } - if (!PageNosaveFree(virt_to_page(lp))) { + if (!swsusp_page_is_free(virt_to_page(lp))) { /* The page is "safe", add it to the list */ lp->next = safe_pages_list; safe_pages_list = lp; } /* Mark the page as allocated */ - SetPageNosave(virt_to_page(lp)); - SetPageNosaveFree(virt_to_page(lp)); + swsusp_set_page_forbidden(virt_to_page(lp)); + swsusp_set_page_free(virt_to_page(lp)); nr_pages--; } /* Free the reserved safe pages so that chain_alloc() can use them */ @@ -1558,7 +1766,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); - if (PageNosave(page) && PageNosaveFree(page)) + if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) /* We have allocated the "original" page frame and we can * use it directly to store the loaded page. 
*/ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 3581f8f86ac..b8b235cc19d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -12,7 +12,6 @@ */ #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/file.h> #include <linux/utsname.h> #include <linux/version.h> @@ -33,12 +32,14 @@ extern char resume_file[]; #define SWSUSP_SIG "S1SUSPEND" -static struct swsusp_header { +struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; sector_t image; char orig_sig[10]; char sig[10]; -} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; +} __attribute__((packed)); + +static struct swsusp_header *swsusp_header; /* * General things @@ -141,14 +142,14 @@ static int mark_swapfiles(sector_t start) { int error; - bio_read_page(swsusp_resume_block, &swsusp_header, NULL); - if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || - !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { - memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); - memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - swsusp_header.image = start; + bio_read_page(swsusp_resume_block, swsusp_header, NULL); + if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || + !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { + memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); + memcpy(swsusp_header->sig,SWSUSP_SIG, 10); + swsusp_header->image = start; error = bio_write_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); } else { printk(KERN_ERR "swsusp: Swap header not found!\n"); error = -ENODEV; @@ -241,7 +242,6 @@ struct swap_map_page { struct swap_map_handle { struct swap_map_page *cur; sector_t cur_swap; - struct bitmap_page *bitmap; unsigned int k; }; @@ -250,9 +250,6 @@ static void release_swap_writer(struct swap_map_handle *handle) if (handle->cur) free_page((unsigned long)handle->cur); handle->cur = NULL; - if (handle->bitmap) - free_bitmap(handle->bitmap); - handle->bitmap = NULL; } static int get_swap_writer(struct swap_map_handle *handle) @@ -260,12 +257,7 @@ static int get_swap_writer(struct swap_map_handle *handle) handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); if (!handle->cur) return -ENOMEM; - handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0)); - if (!handle->bitmap) { - release_swap_writer(handle); - return -ENOMEM; - } - handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap); + handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { release_swap_writer(handle); return -ENOSPC; @@ -282,7 +274,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, if (!handle->cur) return -EINVAL; - offset = alloc_swapdev_block(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap); error = write_page(buf, offset, bio_chain); if (error) return error; @@ -291,7 +283,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = wait_on_bio_chain(bio_chain); if (error) goto out; - offset = alloc_swapdev_block(root_swap, handle->bitmap); + offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; @@ -428,7 +420,8 @@ int swsusp_write(void) } } if (error) - free_all_swap_pages(root_swap, handle.bitmap); + free_all_swap_pages(root_swap); + release_swap_writer(&handle); out: swsusp_close(); @@ -564,7 +557,7 @@ int swsusp_read(void) if (error < PAGE_SIZE) return error < 0 ? 
error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header.image); + error = get_swap_reader(&handle, swsusp_header->image); if (!error) error = swap_read_page(&handle, header, NULL); if (!error) @@ -591,17 +584,17 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - memset(&swsusp_header, 0, sizeof(swsusp_header)); + memset(swsusp_header, 0, sizeof(PAGE_SIZE)); error = bio_read_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); if (error) return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ error = bio_write_page(swsusp_resume_block, - &swsusp_header, NULL); + swsusp_header, NULL); } else { return -EINVAL; } @@ -632,3 +625,13 @@ void swsusp_close(void) blkdev_put(resume_bdev); } + +static int swsusp_header_init(void) +{ + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); + if (!swsusp_header) + panic("Could not allocate memory for swsusp_header\n"); + return 0; +} + +core_initcall(swsusp_header_init); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 175370824f3..5da304c8f1f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -50,6 +50,7 @@ #include <linux/syscalls.h> #include <linux/highmem.h> #include <linux/time.h> +#include <linux/rbtree.h> #include "power.h" @@ -74,72 +75,69 @@ static inline unsigned int count_highmem_pages(void) { return 0; } /** * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. 
- * - * The functions operate on a linked bitmap structure defined - * in power.h */ -void free_bitmap(struct bitmap_page *bitmap) -{ - struct bitmap_page *bp; +struct swsusp_extent { + struct rb_node node; + unsigned long start; + unsigned long end; +}; - while (bitmap) { - bp = bitmap->next; - free_page((unsigned long)bitmap); - bitmap = bp; - } -} +static struct rb_root swsusp_extents = RB_ROOT; -struct bitmap_page *alloc_bitmap(unsigned int nr_bits) +static int swsusp_extents_insert(unsigned long swap_offset) { - struct bitmap_page *bitmap, *bp; - unsigned int n; - - if (!nr_bits) - return NULL; - - bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); - bp = bitmap; - for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) { - bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL); - bp = bp->next; - if (!bp) { - free_bitmap(bitmap); - return NULL; + struct rb_node **new = &(swsusp_extents.rb_node); + struct rb_node *parent = NULL; + struct swsusp_extent *ext; + + /* Figure out where to put the new node */ + while (*new) { + ext = container_of(*new, struct swsusp_extent, node); + parent = *new; + if (swap_offset < ext->start) { + /* Try to merge */ + if (swap_offset == ext->start - 1) { + ext->start--; + return 0; + } + new = &((*new)->rb_left); + } else if (swap_offset > ext->end) { + /* Try to merge */ + if (swap_offset == ext->end + 1) { + ext->end++; + return 0; + } + new = &((*new)->rb_right); + } else { + /* It already is in the tree */ + return -EINVAL; } } - return bitmap; -} - -static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) -{ - unsigned int n; - - n = BITMAP_PAGE_BITS; - while (bitmap && n <= bit) { - n += BITMAP_PAGE_BITS; - bitmap = bitmap->next; - } - if (!bitmap) - return -EINVAL; - n -= BITMAP_PAGE_BITS; - bit -= n; - n = 0; - while (bit >= BITS_PER_CHUNK) { - bit -= BITS_PER_CHUNK; - n++; - } - bitmap->chunks[n] |= (1UL << bit); + /* Add the new node and rebalance the tree. */ + ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL); + if (!ext) + return -ENOMEM; + + ext->start = swap_offset; + ext->end = swap_offset; + rb_link_node(&ext->node, parent, new); + rb_insert_color(&ext->node, &swsusp_extents); return 0; } -sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) +/** + * alloc_swapdev_block - allocate a swap page and register that it has + * been allocated, so that it can be freed in case of an error. + */ + +sector_t alloc_swapdev_block(int swap) { unsigned long offset; offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { - if (bitmap_set(bitmap, offset)) + if (swsusp_extents_insert(offset)) swap_free(swp_entry(swap, offset)); else return swapdev_block(swap, offset); @@ -147,23 +145,34 @@ sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) return 0; } -void free_all_swap_pages(int swap, struct bitmap_page *bitmap) +/** + * free_all_swap_pages - free swap pages allocated for saving image data. + * It also frees the extents used to register which swap entres had been + * allocated. 
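The swsusp_extent tree replaces the old per-offset bitmap: each allocated swap offset either widens an adjacent extent by one or starts a new single-offset extent. A deliberately simplified model of that merge logic; a flat array is used instead of an rbtree purely to keep the sketch short, and as in the insert code above two extents that become adjacent are not coalesced with each other:

    #include <stdio.h>

    struct extent { unsigned long start, end; };

    static struct extent extents[64];
    static int nr_extents;

    static int extent_insert(unsigned long off)
    {
        for (int i = 0; i < nr_extents; i++) {
            if (off >= extents[i].start && off <= extents[i].end)
                return -1;                              /* already recorded */
            if (off + 1 == extents[i].start) { extents[i].start--; return 0; }
            if (off == extents[i].end + 1)   { extents[i].end++;   return 0; }
        }
        if (nr_extents == (int)(sizeof(extents) / sizeof(extents[0])))
            return -1;                                  /* table full (sketch only) */
        extents[nr_extents].start = extents[nr_extents].end = off;
        nr_extents++;
        return 0;
    }

    int main(void)
    {
        unsigned long offs[] = { 10, 11, 13, 12 };

        for (int i = 0; i < 4; i++)
            extent_insert(offs[i]);
        printf("%d extent(s), first = [%lu, %lu]\n",    /* 2 extent(s), first = [10, 12] */
               nr_extents, extents[0].start, extents[0].end);
        return 0;
    }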
+ */ + +void free_all_swap_pages(int swap) { - unsigned int bit, n; - unsigned long test; - - bit = 0; - while (bitmap) { - for (n = 0; n < BITMAP_PAGE_CHUNKS; n++) - for (test = 1UL; test; test <<= 1) { - if (bitmap->chunks[n] & test) - swap_free(swp_entry(swap, bit)); - bit++; - } - bitmap = bitmap->next; + struct rb_node *node; + + while ((node = swsusp_extents.rb_node)) { + struct swsusp_extent *ext; + unsigned long offset; + + ext = container_of(node, struct swsusp_extent, node); + rb_erase(node, &swsusp_extents); + for (offset = ext->start; offset <= ext->end; offset++) + swap_free(swp_entry(swap, offset)); + + kfree(ext); } } +int swsusp_swap_in_use(void) +{ + return (swsusp_extents.rb_node != NULL); +} + /** * swsusp_show_speed - print the time elapsed between two events represented by * @start and @stop @@ -224,7 +233,7 @@ int swsusp_shrink_memory(void) long size, highmem_size; highmem_size = count_highmem_pages(); - size = count_data_pages() + PAGES_FOR_IO; + size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES; tmp = size; size += highmem_size; for_each_zone (zone) diff --git a/kernel/power/user.c b/kernel/power/user.c index 7cf6713b232..040560d9c31 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -33,25 +33,29 @@ static struct snapshot_data { struct snapshot_handle handle; int swap; - struct bitmap_page *bitmap; int mode; char frozen; char ready; char platform_suspend; } snapshot_state; -static atomic_t device_available = ATOMIC_INIT(1); +atomic_t snapshot_device_available = ATOMIC_INIT(1); static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; - if (!atomic_add_unless(&device_available, -1, 0)) + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) return -EBUSY; - if ((filp->f_flags & O_ACCMODE) == O_RDWR) + if ((filp->f_flags & O_ACCMODE) == O_RDWR) { + atomic_inc(&snapshot_device_available); return -ENOSYS; - + } + if(create_basic_memory_bitmaps()) { + atomic_inc(&snapshot_device_available); + return -ENOMEM; + } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; @@ -64,7 +68,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; } - data->bitmap = NULL; data->frozen = 0; data->ready = 0; data->platform_suspend = 0; @@ -77,16 +80,15 @@ static int snapshot_release(struct inode *inode, struct file *filp) struct snapshot_data *data; swsusp_free(); + free_basic_memory_bitmaps(); data = filp->private_data; - free_all_swap_pages(data->swap, data->bitmap); - free_bitmap(data->bitmap); + free_all_swap_pages(data->swap); if (data->frozen) { mutex_lock(&pm_mutex); thaw_processes(); - enable_nonboot_cpus(); mutex_unlock(&pm_mutex); } - atomic_inc(&device_available); + atomic_inc(&snapshot_device_available); return 0; } @@ -294,14 +296,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -ENODEV; break; } - if (!data->bitmap) { - data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0)); - if (!data->bitmap) { - error = -ENOMEM; - break; - } - } - offset = alloc_swapdev_block(data->swap, data->bitmap); + offset = alloc_swapdev_block(data->swap); if (offset) { offset <<= PAGE_SHIFT; error = put_user(offset, (sector_t __user *)arg); @@ -315,13 +310,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -ENODEV; break; } - free_all_swap_pages(data->swap, data->bitmap); - free_bitmap(data->bitmap); - data->bitmap = NULL; + free_all_swap_pages(data->swap); break; case SNAPSHOT_SET_SWAP_FILE: - 
if (!data->bitmap) { + if (!swsusp_swap_in_use()) { /* * User space encodes device types as two-byte values, * so we need to recode them @@ -420,7 +413,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_SET_SWAP_AREA: - if (data->bitmap) { + if (swsusp_swap_in_use()) { error = -EPERM; } else { struct resume_swap_area swap_area; diff --git a/kernel/printk.c b/kernel/printk.c index 4b47e59248d..0bbdeac2810 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -20,7 +20,6 @@ #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> -#include <linux/smp_lock.h> #include <linux/console.h> #include <linux/init.h> #include <linux/module.h> @@ -931,8 +930,16 @@ void register_console(struct console *console) { int i; unsigned long flags; + struct console *bootconsole = NULL; - if (preferred_console < 0) + if (console_drivers) { + if (console->flags & CON_BOOT) + return; + if (console_drivers->flags & CON_BOOT) + bootconsole = console_drivers; + } + + if (preferred_console < 0 || bootconsole || !console_drivers) preferred_console = selected_console; /* @@ -978,8 +985,11 @@ void register_console(struct console *console) if (!(console->flags & CON_ENABLED)) return; - if (console_drivers && (console_drivers->flags & CON_BOOT)) { - unregister_console(console_drivers); + if (bootconsole) { + printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", + bootconsole->name, bootconsole->index, + console->name, console->index); + unregister_console(bootconsole); console->flags &= ~CON_PRINTBUFFER; } @@ -1030,16 +1040,11 @@ int unregister_console(struct console *console) } } - /* If last console is removed, we re-enable picking the first - * one that gets registered. Without that, pmac early boot console - * would prevent fbcon from taking over. - * + /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. */ - if (console_drivers == NULL) - preferred_console = selected_console; - else if (console->flags & CON_CONSDEV) + if (console_drivers != NULL && console->flags & CON_CONSDEV) console_drivers->flags |= CON_CONSDEV; release_console_sem(); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index bcd14e83ef3..55ba82a85a6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, - &sched_ops, NULL }; - /* * RCU torture writer kthread. 
Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure @@ -534,7 +530,7 @@ rcu_torture_writer(void *arg) rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); - if (old_rp != NULL) { + if (old_rp) { i = old_rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; @@ -685,7 +681,7 @@ rcu_torture_printk(char *page) atomic_read(&rcu_torture_wcount[i])); } cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats != NULL) + if (cur_ops->stats) cnt += cur_ops->stats(&page[cnt]); return cnt; } @@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void) set_cpus_allowed(current, tmp_mask); - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) set_cpus_allowed(reader_tasks[i], tmp_mask); } - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) set_cpus_allowed(fakewriter_tasks[i], tmp_mask); @@ -808,21 +804,21 @@ rcu_torture_cleanup(void) int i; fullstop = 1; - if (shuffler_task != NULL) { + if (shuffler_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); } shuffler_task = NULL; - if (writer_task != NULL) { + if (writer_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); } writer_task = NULL; - if (reader_tasks != NULL) { + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i] != NULL) { + if (reader_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_reader task"); kthread_stop(reader_tasks[i]); @@ -834,9 +830,9 @@ rcu_torture_cleanup(void) } rcu_torture_current = NULL; - if (fakewriter_tasks != NULL) { + if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i] != NULL) { + if (fakewriter_tasks[i]) { VERBOSE_PRINTK_STRING( "Stopping rcu_torture_fakewriter task"); kthread_stop(fakewriter_tasks[i]); @@ -847,7 +843,7 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - if (stats_task != NULL) { + if (stats_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); } @@ -858,7 +854,7 @@ rcu_torture_cleanup(void) rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - if (cur_ops->cleanup != NULL) + if (cur_ops->cleanup) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms("End of test: FAILURE"); @@ -866,27 +862,28 @@ rcu_torture_cleanup(void) rcu_torture_print_module_parms("End of test: SUCCESS"); } -static int +static int __init rcu_torture_init(void) { int i; int cpu; int firsterr = 0; + static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &sched_ops, }; /* Process args and tell the world that the torturer is on the job. */ - - for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) { + if (strcmp(torture_type, cur_ops->name) == 0) break; - } } - if (cur_ops == NULL) { + if (i == ARRAY_SIZE(torture_ops)) { printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); return (-EINVAL); } - if (cur_ops->init != NULL) + if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ if (nreaders >= 0) @@ -899,7 +896,7 @@ rcu_torture_init(void) /* Set up the freelist. 
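The reworked rcu_torture_init() drops the NULL sentinel from torture_ops[] and instead treats "index reached ARRAY_SIZE()" as the invalid-torture-type case. The same lookup pattern in isolation; the ops table below is invented:

    #include <stdio.h>
    #include <string.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct ops { const char *name; };

    static struct ops ops_table[] = { { "rcu" }, { "rcu_bh" }, { "srcu" } };

    /* Walk the fixed table; NULL (i.e. the index hit ARRAY_SIZE) means unknown name. */
    static struct ops *find_ops(const char *name)
    {
        for (size_t i = 0; i < ARRAY_SIZE(ops_table); i++)
            if (strcmp(name, ops_table[i].name) == 0)
                return &ops_table[i];
        return NULL;
    }

    int main(void)
    {
        printf("srcu:  %s\n", find_ops("srcu")  ? "found" : "invalid");
        printf("bogus: %s\n", find_ops("bogus") ? "found" : "invalid");
        return 0;
    }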
*/ INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { rcu_tortures[i].rtort_mbtest = 0; list_add_tail(&rcu_tortures[i].rtort_free, &rcu_torture_freelist); diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 291ded556aa..9a87886b022 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem) int ret = __down_write_trylock(sem); if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); return ret; } diff --git a/kernel/sched.c b/kernel/sched.c index 960d7c5fca3..a3a04085e79 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -52,8 +52,9 @@ #include <linux/tsacct_kern.h> #include <linux/kprobes.h> #include <linux/delayacct.h> -#include <asm/tlb.h> +#include <linux/reciprocal_div.h> +#include <asm/tlb.h> #include <asm/unistd.h> /* @@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void) (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) #define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) + (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active)) #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) @@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +#ifdef CONFIG_SMP +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif + /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ... 
5ms] @@ -223,6 +245,10 @@ struct rq { unsigned long raw_weighted_load; #ifdef CONFIG_SMP unsigned long cpu_load[3]; + unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned char in_nohz_recently; +#endif #endif unsigned long long nr_switches; @@ -278,7 +304,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; static inline int cpu_of(struct rq *rq) { @@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p) if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } + +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); +} #else static inline void resched_task(struct task_struct *p) { @@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); if (local_group) { this_load = avg_load; @@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p) struct sched_domain *sd; int i; - if (idle_cpu(cpu)) + /* + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. + */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) return cpu; for_each_domain(cpu, sd) { @@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } total_load += avg_load; - total_pwr += group->cpu_power; + total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); - group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { this_load = avg_load; @@ -2468,8 +2516,8 @@ group_next: max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->cpu_power, - (avg_load - this_load) * this->cpu_power) + *imbalance = min(max_pull * busiest->__cpu_power, + (avg_load - this_load) * this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -2503,28 +2551,29 @@ small_imbalance: * moving them. 
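sg_div_cpu_power() replaces each division by sg->__cpu_power with a multiply-and-shift against a reciprocal that sg_inc_cpu_power() recomputes only when the group's power actually changes. A userspace sketch of the trick, computing the reciprocal as ceil(2^32 / d); the sample load and cpu_power values are arbitrary, and the program simply compares the fast path against an exact division for that one case:

    #include <stdint.h>
    #include <stdio.h>

    /* Precompute r ~= 2^32 / d once, then a / d becomes a multiply and a shift. */
    static uint32_t reciprocal_value(uint32_t d)
    {
        return (uint32_t)(((1ULL << 32) + d - 1) / d);   /* ceil(2^32 / d) */
    }

    static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
    {
        return (uint32_t)(((uint64_t)a * r) >> 32);
    }

    int main(void)
    {
        uint32_t cpu_power = 1178;                  /* plays the role of sg->__cpu_power */
        uint32_t r = reciprocal_value(cpu_power);   /* redone only when cpu_power changes */
        uint32_t load = 123456;

        printf("fast: %u  exact: %u\n",
               (unsigned)reciprocal_divide(load, r), (unsigned)(load / cpu_power));
        return 0;
    }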
*/ - pwr_now += busiest->cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->cpu_power * - min(this_load_per_task, this_load); + pwr_now += busiest->__cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->__cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - busiest->cpu_power; + tmp = sg_div_cpu_power(busiest, + busiest_load_per_task * SCHED_LOAD_SCALE); if (max_load > tmp) - pwr_move += busiest->cpu_power * + pwr_move += busiest->__cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->cpu_power < + if (max_load * busiest->__cpu_power < busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = max_load * busiest->cpu_power / this->cpu_power; + tmp = sg_div_cpu_power(this, + max_load * busiest->__cpu_power); else - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - this->cpu_power; - pwr_move += this->cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(this, + busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += this->__cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2657,6 +2706,12 @@ redo: double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + /* + * some other cpu did the load balance for us. + */ + if (nr_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); @@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq) } } +#ifdef CONFIG_NO_HZ +static struct { + atomic_t load_balancer; + cpumask_t cpu_mask; +} nohz ____cacheline_aligned = { + .load_balancer = ATOMIC_INIT(-1), + .cpu_mask = CPU_MASK_NONE, +}; + /* - * run_rebalance_domains is triggered when needed from the scheduler tick. + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. If all the cpus in the system + * go into this tickless mode, then there will be no ilb owner (as there is + * no need for one) and all the cpus will sleep till the next wakeup event + * arrives... * + * For the ilb owner, tick is not stopped. And this tick will be used + * for idle load balancing. ilb owner will still be part of + * nohz.cpu_mask.. + * + * While stopping the tick, this cpu will become the ilb owner if there + * is no other owner. And will be the owner till that cpu becomes busy + * or if all cpus in the system stop their ticks at which point + * there is no need for ilb owner. + * + * When the ilb owner becomes busy, it nominates another owner, during the + * next busy scheduler_tick() + */ +int select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + cpu_set(cpu, nohz.cpu_mask); + cpu_rq(cpu)->in_nohz_recently = 1; + + /* + * If we are going offline and still the leader, give up! 
+ */ + if (cpu_is_offline(cpu) && + atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + return 0; + } + + /* time for ilb owner also to sleep */ + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (atomic_read(&nohz.load_balancer) == cpu) + atomic_set(&nohz.load_balancer, -1); + return 0; + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) + return 1; + } else if (atomic_read(&nohz.load_balancer) == cpu) + return 1; + } else { + if (!cpu_isset(cpu, nohz.cpu_mask)) + return 0; + + cpu_clear(cpu, nohz.cpu_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + } + return 0; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in arch_init_sched_domains. */ -static DEFINE_SPINLOCK(balancing); - -static void run_rebalance_domains(struct softirq_action *h) +static inline void rebalance_domains(int cpu, enum idle_type idle) { - int this_cpu = smp_processor_id(), balance = 1; - struct rq *this_rq = cpu_rq(this_cpu); + int balance = 1; + struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; - /* - * We are idle if there are no processes running. This - * is valid even if we are the idle process (SMT). - */ - enum idle_type idle = !this_rq->nr_running ? - SCHED_IDLE : NOT_IDLE; - /* Earliest time when we have to call run_rebalance_domains again */ + /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; - for_each_domain(this_cpu, sd) { + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2995,7 +3116,114 @@ out: if (!balance) break; } - this_rq->next_balance = next_balance; + rq->next_balance = next_balance; +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int local_cpu = smp_processor_id(); + struct rq *local_rq = cpu_rq(local_cpu); + enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE; + + rebalance_domains(local_cpu, idle); + +#ifdef CONFIG_NO_HZ + /* + * If this cpu is the owner for idle load balancing, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + if (local_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == local_cpu) { + cpumask_t cpus = nohz.cpu_mask; + struct rq *rq; + int balance_cpu; + + cpu_clear(local_cpu, cpus); + for_each_cpu_mask(balance_cpu, cpus) { + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. 
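
The ilb (idle load balancer) nomination in select_nohz_load_balancer() above is a lock-free single-owner election: nohz.load_balancer holds the owning cpu or -1, and atomic_cmpxchg() guarantees that exactly one of the racing idle cpus wins the slot. A stripped-down sketch of that election pattern, with C11 atomics standing in for the kernel's atomic_t (names here are illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NO_OWNER (-1)

    static atomic_int owner = ATOMIC_VAR_INIT(NO_OWNER);

    /* Try to take the duty; true means this cpu now is (or already was) the owner. */
    static bool claim(int cpu)
    {
            int expected = NO_OWNER;

            if (atomic_load(&owner) == cpu)
                    return true;
            /* Only one of the racing cpus succeeds in swapping -1 for its id. */
            return atomic_compare_exchange_strong(&owner, &expected, cpu);
    }

    /* Drop the duty, but only if we actually hold it. */
    static void yield_duty(int cpu)
    {
            int expected = cpu;

            atomic_compare_exchange_strong(&owner, &expected, NO_OWNER);
    }

    int main(void)
    {
            printf("cpu3 owns: %d\n", claim(3));   /* 1 */
            printf("cpu5 owns: %d\n", claim(5));   /* 0, cpu3 already won */
            yield_duty(3);
            printf("cpu5 owns: %d\n", claim(5));   /* 1 */
            return 0;
    }

The kernel version layers the extra rules from the comment above on top of this: the owner gives the slot up when it goes offline or when every online cpu has stopped its tick, since at that point nobody needs a balancer.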
+ */ + if (need_resched()) + break; + + rebalance_domains(balance_cpu, SCHED_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(local_rq->next_balance, rq->next_balance)) + local_rq->next_balance = rq->next_balance; + } + } +#endif +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * + * In case of CONFIG_NO_HZ, this is the place where we nominate a new + * idle load balancing owner or decide to stop the periodic load balancing, + * if the whole system is idle. + */ +static inline void trigger_load_balance(int cpu) +{ + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_NO_HZ + /* + * If we were in the nohz mode recently and busy at the current + * scheduler tick, then check if we need to nominate new idle + * load balancer. + */ + if (rq->in_nohz_recently && !rq->idle_at_tick) { + rq->in_nohz_recently = 0; + + if (atomic_read(&nohz.load_balancer) == cpu) { + cpu_clear(cpu, nohz.cpu_mask); + atomic_set(&nohz.load_balancer, -1); + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* + * simple selection for now: Nominate the + * first cpu in the nohz list to be the next + * ilb owner. + * + * TBD: Traverse the sched domains and nominate + * the nearest cpu in the nohz.cpu_mask. + */ + int ilb = first_cpu(nohz.cpu_mask); + + if (ilb != NR_CPUS) + resched_cpu(ilb); + } + } + + /* + * If this cpu is idle and doing idle load balancing for all the + * cpus with ticks stopped, is it time for that to stop? + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && + cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + resched_cpu(cpu); + return; + } + + /* + * If this cpu is idle and the idle load balancing is done by + * someone else, then no need raise the SCHED_SOFTIRQ + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && + cpu_isset(cpu, nohz.cpu_mask)) + return; +#endif + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); } #else /* @@ -3218,16 +3446,17 @@ void scheduler_tick(void) unsigned long long now = sched_clock(); struct task_struct *p = current; int cpu = smp_processor_id(); + int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); update_cpu_clock(p, rq, now); - if (p != rq->idle) + if (!idle_at_tick) task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); + rq->idle_at_tick = idle_at_tick; + trigger_load_balance(cpu); #endif } @@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio) struct prio_array *array; unsigned long flags; struct rq *rq; - int oldprio; + int delta; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - oldprio = p->prio; + delta = prio - p->prio; array = p->array; if (array) dequeue_task(p, array); @@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(p, array); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's + * our priority decreased, or if our priority became higher + * than the current's. 
*/ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) + if (TASK_PREEMPTS_CURR(p, rq) || + (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } task_rq_unlock(rq, &flags); @@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice) enqueue_task(p, array); inc_raw_weighted_load(rq, p); /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if our priority became higher + * than the current's. */ - if (delta < 0 || (delta > 0 && task_running(rq, p))) + if (TASK_PREEMPTS_CURR(p, rq) || + (delta > 0 && task_running(rq, p))) resched_task(rq->curr); } out_unlock: @@ -4153,13 +4382,11 @@ recheck: __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's + * our priority decreased, or our priority became higher + * than the current's. */ - if (task_running(rq, p)) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else if (TASK_PREEMPTS_CURR(p, rq)) + if (TASK_PREEMPTS_CURR(p, rq) || + (task_running(rq, p) && p->prio > oldprio)) resched_task(rq->curr); } __task_rq_unlock(rq); @@ -4750,6 +4977,8 @@ void show_state_filter(unsigned long state_filter) show_task(p); } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -5244,6 +5473,11 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP + +/* Number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + #undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) @@ -5299,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } - if (!group->cpu_power) { + if (!group->__cpu_power) { printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -5476,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, continue; sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; + sg->__cpu_power = 0; for_each_cpu_mask(j, span) { if (group_fn(j, cpu_map, NULL) != group) @@ -6165,7 +6399,7 @@ next_sg: continue; } - sg->cpu_power += sd->groups->cpu_power; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); } sg = sg->next; if (sg != group_head) @@ -6240,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; + sd->groups->__cpu_power = 0; + /* * For perf policy, if the groups in child domain share resources * (for example cores sharing some portions of the cache hierarchy @@ -6250,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && (child->flags & (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); return; } - sd->groups->cpu_power = 0; - /* * add cpu_power of each child group to this groups cpu_power */ group = child->groups; do { - sd->groups->cpu_power += group->cpu_power; + sg_inc_cpu_power(sd->groups, group->__cpu_power); group = group->next; } while (group != child->groups); } @@ -6421,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = 
&per_cpu(node_domains, j); sd->groups = sg; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = nodemask; sg->next = sg; cpus_or(covered, covered, nodemask); @@ -6449,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) "Can not alloc domain group for node %d\n", j); goto error; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = tmp; sg->next = prev->next; cpus_or(covered, covered, tmp); @@ -6726,6 +6960,7 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { int i, j, k; + int highest_cpu = 0; for_each_possible_cpu(i) { struct prio_array *array; @@ -6760,11 +6995,13 @@ void __init sched_init(void) // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); } + highest_cpu = i; } set_load_weight(&init_task); #ifdef CONFIG_SMP + nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); #endif diff --git a/kernel/signal.c b/kernel/signal.c index 3670225ecbc..1368e67c848 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -12,7 +12,6 @@ #include <linux/slab.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/fs.h> @@ -2636,9 +2635,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { - sigqueue_cachep = - kmem_cache_create("sigqueue", - sizeof(struct sigqueue), - __alignof__(struct sigqueue), - SLAB_PANIC, NULL, NULL); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 50afeb81330..8fa7040247a 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -34,12 +34,32 @@ static struct notifier_block panic_block = { .notifier_call = softlock_panic, }; +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. 
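
The KMEM_CACHE(sigqueue, SLAB_PANIC) form in signals_init() above is shorthand for the removed open-coded call: the macro derives the cache name, object size and alignment from the struct type instead of spelling them out by hand. The same stringify-plus-sizeof trick in a runnable toy (the struct and names are made up for illustration):

    #include <stdio.h>
    #include <stddef.h>

    struct sigqueue_like {
            long list[2];
            int flags;
    };

    /* Derive name, size and alignment from the type, KMEM_CACHE() style. */
    #define CACHE_DESC(type) #type, sizeof(struct type), __alignof__(struct type)

    static void report(const char *name, size_t size, size_t align)
    {
            printf("cache %s: object size %zu, alignment %zu\n", name, size, align);
    }

    int main(void)
    {
            report(CACHE_DESC(sigqueue_like));
            return 0;
    }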
+ */ +static unsigned long get_timestamp(void) +{ + return sched_clock() >> 30; /* 2^30 ~= 10^9 */ +} + void touch_softlockup_watchdog(void) { - __raw_get_cpu_var(touch_timestamp) = jiffies; + __raw_get_cpu_var(touch_timestamp) = get_timestamp(); } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* Cause each CPU to re-update its timestamp rather than complain */ + for_each_online_cpu(cpu) + per_cpu(touch_timestamp, cpu) = 0; +} +EXPORT_SYMBOL(touch_all_softlockup_watchdogs); + /* * This callback runs from the timer interrupt, and checks * whether the watchdog thread has hung or not: @@ -48,9 +68,18 @@ void softlockup_tick(void) { int this_cpu = smp_processor_id(); unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); + unsigned long print_timestamp; + unsigned long now; + + if (touch_timestamp == 0) { + touch_softlockup_watchdog(); + return; + } + + print_timestamp = per_cpu(print_timestamp, this_cpu); - /* prevent double reports: */ - if (per_cpu(print_timestamp, this_cpu) == touch_timestamp || + /* report at most once a second */ + if (print_timestamp < (touch_timestamp + 1) || did_panic || !per_cpu(watchdog_task, this_cpu)) return; @@ -61,12 +90,14 @@ void softlockup_tick(void) return; } + now = get_timestamp(); + /* Wake up the high-prio watchdog task every second: */ - if (time_after(jiffies, touch_timestamp + HZ)) + if (now > (touch_timestamp + 1)) wake_up_process(per_cpu(watchdog_task, this_cpu)); /* Warn about unreasonable 10+ seconds delays: */ - if (time_after(jiffies, touch_timestamp + 10*HZ)) { + if (now > (touch_timestamp + 10)) { per_cpu(print_timestamp, this_cpu) = touch_timestamp; spin_lock(&print_lock); @@ -82,11 +113,14 @@ void softlockup_tick(void) */ static int watchdog(void * __bind_cpu) { - struct sched_param param = { .sched_priority = 99 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; sched_setscheduler(current, SCHED_FIFO, ¶m); current->flags |= PF_NOFREEZE; + /* initialize timestamp */ + touch_softlockup_watchdog(); + /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 10 seconds then the @@ -118,7 +152,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) printk("watchdog for %i failed\n", hotcpu); return NOTIFY_BAD; } - per_cpu(touch_timestamp, hotcpu) = jiffies; + per_cpu(touch_timestamp, hotcpu) = 0; per_cpu(watchdog_task, hotcpu) = p; kthread_bind(p, hotcpu); break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12458040e66..daabb74ee0b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,11 +1,12 @@ /* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. 
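
get_timestamp() above leans on the fact that 2^30 ns is 1.073741824 s: right-shifting sched_clock() by 30 gives slightly stretched "seconds" without a 64-bit divide, which is accurate enough for the 1-second wakeup and 10-second warning thresholds softlockup_tick() compares against. A quick numerical check of the approximation:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Pretend sched_clock() reports 25 seconds of uptime, in ns. */
            uint64_t ns = 25ULL * 1000000000ULL;

            uint64_t approx = ns >> 30;         /* what get_timestamp() computes */
            uint64_t exact  = ns / 1000000000;  /* the divide it avoids */

            printf("approx=%llu exact=%llu\n",
                   (unsigned long long)approx,
                   (unsigned long long)exact);  /* approx=23 exact=25 */
            return 0;
    }

The error is a constant factor of roughly 7%, so the "10 second" threshold really fires after about 10.7 s of wall time, which is fine for a watchdog.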
*/ -#include <linux/stop_machine.h> -#include <linux/kthread.h> -#include <linux/sched.h> #include <linux/cpu.h> #include <linux/err.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/stop_machine.h> #include <linux/syscalls.h> #include <asm/atomic.h> #include <asm/semaphore.h> @@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) return ret; } +EXPORT_SYMBOL_GPL(stop_machine_run); diff --git a/kernel/sys.c b/kernel/sys.c index 123b165080e..926bf9d7ac4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -881,7 +881,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { - int ret = software_suspend(); + int ret = pm_suspend(PM_SUSPEND_DISK); unlock_kernel(); return ret; } @@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (retval) return retval; + if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim.rlim_cur = 1; + } + task_lock(current->group_leader); *old_rlim = new_rlim; task_unlock(current->group_leader); @@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) unsigned long rlim_cur = new_rlim.rlim_cur; cputime_t cputime; - if (rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - rlim_cur = 1; - } cputime = secs_to_cputime(rlim_cur); read_lock(&tasklist_lock); spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c904748f229..f0664bd5011 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; +extern int maps_protect; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -603,6 +604,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_PROC_FS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "maps_protect", + .data = &maps_protect, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ad7d2392cb0..906cae77158 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -524,9 +524,7 @@ void __init taskstats_init_early(void) { unsigned int i; - taskstats_cache = kmem_cache_create("taskstats_cache", - sizeof(struct taskstats), - 0, SLAB_PANIC, NULL, NULL); + taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); for_each_possible_cpu(i) { INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); init_rwsem(&(per_cpu(listener_array, i).sem)); diff --git a/kernel/time.c b/kernel/time.c index ba18ec4899b..f04791f6940 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -31,7 +31,6 @@ #include <linux/timex.h> #include <linux/capability.h> #include <linux/errno.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> @@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb) } EXPORT_SYMBOL(current_fs_time); +/* + * Convert jiffies 
to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +unsigned int inline jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else + return (j * MSEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +unsigned int inline jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else + return (j * USEC_PER_SEC) / HZ; +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + /** * timespec_trunc - Truncate timespec to a granularity * @t: Timespec @@ -473,36 +502,6 @@ struct timeval ns_to_timeval(const s64 nsec) EXPORT_SYMBOL(ns_to_timeval); /* - * Convert jiffies to milliseconds and back. - * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases: - */ -unsigned int jiffies_to_msecs(const unsigned long j) -{ -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); -#else - return (j * MSEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_msecs); - -unsigned int jiffies_to_usecs(const unsigned long j) -{ -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); -#else - return (j * USEC_PER_SEC) / HZ; -#endif -} -EXPORT_SYMBOL(jiffies_to_usecs); - -/* * When we convert to jiffies then we interpret incoming values * the following way: * diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 93bccba1f26..99b6034fc86 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += ntp.o clocksource.o jiffies.o timer_list.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bfda3f7f071..a96ec9ab345 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -static int tick_do_timer_cpu = -1; +int tick_do_timer_cpu __read_mostly = -1; DEFINE_SPINLOCK(tick_device_lock); /* @@ -295,6 +295,12 @@ static void tick_shutdown(unsigned int *cpup) clockevents_exchange_device(dev, NULL); td->evtdev = NULL; } + /* Transfer the do_timer job away from this cpu */ + if (*cpup == tick_do_timer_cpu) { + int cpu = first_cpu(cpu_online_map); + + tick_do_timer_cpu = (cpu != NR_CPUS) ? 
cpu : -1; + } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c9d203bde51..bb13f272490 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; extern ktime_t tick_period; +extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 51556b95f60..3483e6cb954 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { + if (select_nohz_load_balancer(1)) { + /* + * sched tick not stopped! + */ + cpu_clear(cpu, nohz_cpu_mask); + goto out; + } + ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; } + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. + */ + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = -1; + /* * calculate the expiry time for the next timer wheel * timer @@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void) now = ktime_get(); local_irq_disable(); + select_nohz_load_balancer(0); tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); @@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); + int cpu = smp_processor_id(); ktime_t now = ktime_get(); dev->next_event.tv64 = KTIME_MAX; + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; + /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * When we are idle and the tick is stopped, we have to touch @@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) struct hrtimer_cpu_base *base = timer->base->cpu_base; struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. 
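
The jiffies_to_msecs()/jiffies_to_usecs() helpers moved to the top of kernel/time.c above select one of three expressions at preprocessing time, depending on how HZ relates to the millisecond or microsecond rate. The same three-way split evaluated at run time, just to make the arithmetic concrete (a sketch, not the kernel function):

    #include <stdio.h>

    #define MSEC_PER_SEC 1000UL

    /* Run-time rendering of the compile-time branches in jiffies_to_msecs(). */
    static unsigned int jiffies_to_msecs_for(unsigned long j, unsigned long hz)
    {
            if (hz <= MSEC_PER_SEC && !(MSEC_PER_SEC % hz))
                    return (MSEC_PER_SEC / hz) * j;                /* HZ=100/250/1000 */
            if (hz > MSEC_PER_SEC && !(hz % MSEC_PER_SEC))
                    return (j + hz / MSEC_PER_SEC - 1) / (hz / MSEC_PER_SEC);
            return j * MSEC_PER_SEC / hz;                          /* e.g. HZ=1024 */
    }

    int main(void)
    {
            printf("%u %u %u\n",
                   jiffies_to_msecs_for(250, 1000),    /* 250  */
                   jiffies_to_msecs_for(250, 100),     /* 2500 */
                   jiffies_to_msecs_for(250, 1024));   /* 244  */
            return 0;
    }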
+ */ + if (unlikely(tick_do_timer_cpu == -1)) + tick_do_timer_cpu = cpu; +#endif /* Check, if the jiffies need an update */ - tick_do_update_jiffies64(now); + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); /* * Do not call, when we are not in irq context and have diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 00000000000..f9217bf644f --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,476 @@ +/* + * linux/kernel/time/timekeeping.c + * + * Kernel timekeeping code and accessor functions + * + * This code was moved from linux/kernel/timer.c. + * Please see that file for copyright and history logs. + * + */ + +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/sysdev.h> +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/tick.h> + + +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime and avenrun. + */ +__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + +EXPORT_SYMBOL(xtime_lock); + + +/* + * The current time + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + */ +struct timespec xtime __attribute__ ((aligned (16))); +struct timespec wall_to_monotonic __attribute__ ((aligned (16))); + +EXPORT_SYMBOL(xtime); + + +static struct clocksource *clock; /* pointer to current clocksource */ + + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). + */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. 
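
__get_realtime_clock_ts() above is the standard seqlock read side: snapshot xtime plus the clocksource offset, then loop if read_seqretry() reports that a writer touched xtime_lock in the meantime. The retry structure in isolation, with a toy sequence counter instead of the kernel's seqlock (no memory barriers, single-threaded, purely illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Toy seqcount: even = stable, odd = a writer is mid-update. */
    static atomic_uint seq;
    static long sec, nsec;              /* stand-ins for xtime and the ns offset */

    int main(void)
    {
            unsigned start;
            long s, ns;

            do {
                    /* read_seqbegin(): wait for an even value and remember it */
                    while ((start = atomic_load(&seq)) & 1)
                            ;
                    s  = sec;
                    ns = nsec;
                    /* read_seqretry(): retry if the counter moved while we read */
            } while (atomic_load(&seq) != start);

            printf("%ld.%09ld\n", s, ns);
            return 0;
    }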
+ */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + clock->error = 0; + ntp_clear(); + + update_vsyscall(&xtime, clock); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static void change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + + new = clocksource_get_next(); + + if (clock == new) + return; + + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + + tick_clock_notify(); + + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); +} +#else +static inline void change_clocksource(void) { } +#endif + +/** + * timekeeping_is_continuous - check to see if timekeeping is free running + */ +int timekeeping_is_continuous(void) +{ + unsigned long seq; + int ret; + + do { + seq = read_seqbegin(&xtime_lock); + + ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/** + * read_persistent_clock - Return time in seconds from the persistent clock. + * + * Weak dummy function for arches that do not yet support it. + * Returns seconds from epoch using the battery backed persistent clock. + * Returns zero if unsupported. + * + * XXX - Do be sure to remove it once all arches implement it. 
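
__get_nsec_offset() above converts the raw counter delta to nanoseconds through cyc2ns(). The conversion itself is not part of this diff; assuming the usual clocksource convention, it amounts to ns = (cycles * mult) >> shift, with mult and shift fixed when the clocksource is registered. A quick check with made-up numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            /* Imaginary 1 MHz clocksource: 1000 ns per cycle.               */
            /* With shift = 20, mult = 1000 << 20 encodes the ratio exactly. */
            uint32_t shift = 20;
            uint32_t mult  = 1000u << 20;

            uint64_t cycle_delta = 12345;                /* cycles since cycle_last */
            uint64_t ns = (cycle_delta * mult) >> shift;

            printf("%llu ns\n", (unsigned long long)ns); /* 12345000 ns */
            return 0;
    }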
+ */ +unsigned long __attribute__((weak)) read_persistent_clock(void) +{ + return 0; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + unsigned long sec = read_persistent_clock(); + + write_seqlock_irqsave(&xtime_lock, flags); + + ntp_clear(); + + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); + clock->cycle_last = clocksource_read(clock); + + xtime.tv_sec = sec; + xtime.tv_nsec = 0; + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} + +/* flag for if timekeeping is suspended */ +static int timekeeping_suspended; +/* time in seconds when suspend began */ +static unsigned long timekeeping_suspend_time; + +/** + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/etc are + * still managed by arch specific suspend/resume code. + */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + unsigned long now = read_persistent_clock(); + + write_seqlock_irqsave(&xtime_lock, flags); + + if (now && (now > timekeeping_suspend_time)) { + unsigned long sleep_length = now - timekeeping_suspend_time; + + xtime.tv_sec += sleep_length; + wall_to_monotonic.tv_sec -= sleep_length; + } + /* re-base the last cycle value */ + clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + + touch_softlockup_watchdog(); + + clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); + + /* Resume hrtimers */ + hres_timers_resume(); + + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; + timekeeping_suspend_time = read_persistent_clock(); + write_sequnlock_irqrestore(&xtime_lock, flags); + + clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + .suspend = timekeeping_suspend, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * If the error is already larger, we look ahead even further + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, + s64 *offset) +{ + s64 tick_error, i; + u32 look_ahead, adj; + s32 error2, mult; + + /* + * Use the current error value to determine how much to look ahead. + * The larger the error the slower we adjust for it to avoid problems + * with losing too many ticks, otherwise we would overadjust and + * produce an even larger error. The smaller the adjustment the + * faster we try to adjust for it, as lost ticks can do less harm + * here. This is tuned so that an error of about 1 msec is adusted + * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 
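
timekeeping_resume() above adds the time spent suspended (as measured by read_persistent_clock()) to xtime and subtracts the same amount from wall_to_monotonic. Since monotonic time is defined earlier in this file as xtime plus wall_to_monotonic, the wall clock jumps forward across a suspend while the monotonic clock does not. A toy check of that invariant:

    #include <stdio.h>

    int main(void)
    {
            long xtime_sec = 1180000000;     /* wall time at suspend (arbitrary) */
            long wtm_sec   = -1179990000;    /* wall_to_monotonic: booted 10000 s earlier */
            long sleep     = 3600;           /* persistent clock says we slept an hour */

            long mono_before = xtime_sec + wtm_sec;

            xtime_sec += sleep;              /* what timekeeping_resume() does */
            wtm_sec   -= sleep;

            long mono_after = xtime_sec + wtm_sec;

            printf("monotonic before=%ld after=%ld\n",
                   mono_before, mono_after); /* both 10000: unchanged */
            return 0;
    }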
+ */ + error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = abs(error2); + for (look_ahead = 0; error2 > 0; look_ahead++) + error2 >>= 2; + + /* + * Now calculate the error in (1 << look_ahead) ticks, but first + * remove the single look ahead already included in the error. + */ + tick_error = current_tick_length() >> + (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error -= clock->xtime_interval >> 1; + error = ((error - tick_error) >> look_ahead) + tick_error; + + /* Finally calculate the adjustment shift value. */ + i = *interval; + mult = 1; + if (error < 0) { + error = -error; + *interval = -*interval; + *offset = -*offset; + mult = -1; + } + for (adj = 0; error > i; adj++) + error >>= 1; + + *interval <<= adj; + *offset <<= adj; + return mult << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. + */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + error >>= 2; + if (likely(error <= interval)) + adj = 1; + else + adj = clocksource_bigadjust(error, &interval, &offset); + } else if (error < -interval) { + error >>= 2; + if (likely(error >= -interval)) { + adj = -1; + interval = -interval; + offset = -offset; + } else + adj = clocksource_bigadjust(error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << + (TICK_LENGTH_SHIFT - clock->shift); +} + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +void update_wall_time(void) +{ + cycle_t offset; + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; + +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. 
+ */ + while (offset >= clock->cycle_interval) { + /* accumulate one interval */ + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; + xtime.tv_sec++; + second_overflow(); + } + + /* interpolator bits */ + time_interpolator_update(clock->xtime_interval + >> clock->shift); + + /* accumulate error between NTP and clock interval */ + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } + + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); + + /* store full nanoseconds into xtime */ + xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + change_clocksource(); + update_vsyscall(&xtime, clock); +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 59df5e8555a..b734ca4bc75 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); static void print_name_offset(struct seq_file *m, void *sym) { - unsigned long addr = (unsigned long)sym; - char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; - const char *sym_name; - char *modname; - - sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); - if (sym_name) - SEQ_printf(m, "%s", sym_name); - else + char symname[KSYM_NAME_LEN+1]; + + if (lookup_symbol_name((unsigned long)sym, symname) < 0) SEQ_printf(m, "<%p>", sym); + else + SEQ_printf(m, "%s", symname); } static void diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1bc4882e28e..868f1bceb07 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static void print_name_offset(struct seq_file *m, unsigned long addr) { - char namebuf[KSYM_NAME_LEN+1]; - unsigned long size, offset; - const char *sym_name; - char *modname; - - sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf); - if (sym_name) - seq_printf(m, "%s", sym_name); - else + char symname[KSYM_NAME_LEN+1]; + + if (lookup_symbol_name(addr, symname) < 0) seq_printf(m, "<%p>", (void *)addr); + else + seq_printf(m, "%s", symname); } static int tstats_show(struct seq_file *m, void *v) diff --git a/kernel/timer.c b/kernel/timer.c index b22bd39740d..7a6448340f9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1,7 +1,7 @@ /* * linux/kernel/timer.c * - * Kernel internal timers, kernel timekeeping, basic process system calls + * Kernel internal timers, basic process system calls * * Copyright (C) 1991, 1992 Linus Torvalds * @@ -74,7 +74,7 @@ struct tvec_t_base_s { tvec_t tv3; tvec_t tv4; tvec_t tv5; -} ____cacheline_aligned_in_smp; +} ____cacheline_aligned; typedef struct tvec_t_base_s tvec_base_t; @@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; +/* + * Note that all tvec_bases is 2 byte aligned and lower bit of + * base in timer_list is guaranteed to be zero. 
Use the LSB for + * the new flag to indicate whether the timer is deferrable + */ +#define TBASE_DEFERRABLE_FLAG (0x1) + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(tvec_base_t *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline tvec_base_t *tbase_get_base(tvec_base_t *base) +{ + return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | + TBASE_DEFERRABLE_FLAG)); +} + +static inline void +timer_set_base(struct timer_list *timer, tvec_base_t *new_base) +{ + timer->base = (tvec_base_t *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} + /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded @@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer) } EXPORT_SYMBOL(init_timer); +void fastcall init_timer_deferrable(struct timer_list *timer) +{ + init_timer(timer); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable); + static inline void detach_timer(struct timer_list *timer, int clear_pending) { @@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, tvec_base_t *base; for (;;) { - base = timer->base; + tvec_base_t *prelock_base = timer->base; + base = tbase_get_base(prelock_base); if (likely(base != NULL)) { spin_lock_irqsave(&base->lock, *flags); - if (likely(base == timer->base)) + if (likely(prelock_base == timer->base)) return base; /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); @@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer->base = NULL; + timer_set_base(timer, NULL); spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->base = base; + timer_set_base(timer, base); } } @@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu) timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer->base = base; + timer_set_base(timer, base); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); } @@ -550,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * don't have to detach them individually. */ list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(timer->base != base); + BUG_ON(tbase_get_base(timer->base) != base); internal_add_timer(base, timer); } @@ -590,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base) void (*fn)(unsigned long); unsigned long data; - timer = list_entry(head->next,struct timer_list,entry); + timer = list_first_entry(head, struct timer_list,entry); fn = timer->function; data = timer->data; @@ -636,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base) index = slot = timer_jiffies & TVR_MASK; do { list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + found = 1; expires = nte->expires; /* Look at the cascade bucket(s)? 
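
The TBASE_DEFERRABLE_FLAG scheme above is plain pointer tagging: every tvec_base_t is at least 2-byte aligned, so bit 0 of timer->base is always zero and can carry the "deferrable" flag, with tbase_get_base() masking it off before the pointer is dereferenced (and a WARN_ON later in this patch catching any base that is not aligned as expected). The same idiom in a minimal standalone form (types and names are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    struct base {
            int whatever;                    /* anything with alignment > 1 byte */
    };

    #define DEFERRABLE_FLAG 0x1UL

    static struct base *base_ptr(struct base *tagged)
    {
            return (struct base *)((uintptr_t)tagged & ~DEFERRABLE_FLAG);
    }

    static int base_deferrable(struct base *tagged)
    {
            return (uintptr_t)tagged & DEFERRABLE_FLAG;
    }

    static struct base *mark_deferrable(struct base *b)
    {
            return (struct base *)((uintptr_t)b | DEFERRABLE_FLAG);
    }

    int main(void)
    {
            static struct base b;
            struct base *tagged = mark_deferrable(&b);

            printf("deferrable=%d untagged_matches=%d\n",
                   base_deferrable(tagged), base_ptr(tagged) == &b);  /* 1 1 */
            return 0;
    }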
*/ @@ -752,455 +794,6 @@ unsigned long next_timer_interrupt(void) #endif -/******************************************************************/ - -/* - * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); - -EXPORT_SYMBOL(xtime); - - -/* XXX - all of this timekeeping code should be later moved to time.c */ -#include <linux/clocksource.h> -static struct clocksource *clock; /* pointer to current clocksource */ - -#ifdef CONFIG_GENERIC_TIME -/** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook - * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) - */ -static inline s64 __get_nsec_offset(void) -{ - cycle_t cycle_now, cycle_delta; - s64 ns_offset; - - /* read clocksource: */ - cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ - cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); - - return ns_offset; -} - -/** - * __get_realtime_clock_ts - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. Used by - * do_gettimeofday() and get_realtime_clock_ts(). - */ -static inline void __get_realtime_clock_ts(struct timespec *ts) -{ - unsigned long seq; - s64 nsecs; - - do { - seq = read_seqbegin(&xtime_lock); - - *ts = xtime; - nsecs = __get_nsec_offset(); - - } while (read_seqretry(&xtime_lock, seq)); - - timespec_add_ns(ts, nsecs); -} - -/** - * getnstimeofday - Returns the time of day in a timespec - * @ts: pointer to the timespec to be set - * - * Returns the time of day in a timespec. 
- */ -void getnstimeofday(struct timespec *ts) -{ - __get_realtime_clock_ts(ts); -} - -EXPORT_SYMBOL(getnstimeofday); - -/** - * do_gettimeofday - Returns the time of day in a timeval - * @tv: pointer to the timeval to be set - * - * NOTE: Users should be converted to using get_realtime_clock_ts() - */ -void do_gettimeofday(struct timeval *tv) -{ - struct timespec now; - - __get_realtime_clock_ts(&now); - tv->tv_sec = now.tv_sec; - tv->tv_usec = now.tv_nsec/1000; -} - -EXPORT_SYMBOL(do_gettimeofday); -/** - * do_settimeofday - Sets the time of day - * @tv: pointer to the timespec variable containing the new time - * - * Sets the time of day to the new time and update NTP and notify hrtimers - */ -int do_settimeofday(struct timespec *tv) -{ - unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - nsec -= __get_nsec_offset(); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - clock->error = 0; - ntp_clear(); - - update_vsyscall(&xtime, clock); - - write_sequnlock_irqrestore(&xtime_lock, flags); - - /* signal hrtimers about time change */ - clock_was_set(); - - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -/** - * change_clocksource - Swaps clocksources if a new one is available - * - * Accumulates current time interval and initializes new clocksource - */ -static void change_clocksource(void) -{ - struct clocksource *new; - cycle_t now; - u64 nsec; - - new = clocksource_get_next(); - - if (clock == new) - return; - - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); - - clock = new; - clock->cycle_last = now; - - clock->error = 0; - clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - - tick_clock_notify(); - - printk(KERN_INFO "Time: %s clocksource has been installed.\n", - clock->name); -} -#else -static inline void change_clocksource(void) { } -#endif - -/** - * timekeeping_is_continuous - check to see if timekeeping is free running - */ -int timekeeping_is_continuous(void) -{ - unsigned long seq; - int ret; - - do { - seq = read_seqbegin(&xtime_lock); - - ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - - } while (read_seqretry(&xtime_lock, seq)); - - return ret; -} - -/** - * read_persistent_clock - Return time in seconds from the persistent clock. - * - * Weak dummy function for arches that do not yet support it. - * Returns seconds from epoch using the battery backed persistent clock. - * Returns zero if unsupported. - * - * XXX - Do be sure to remove it once all arches implement it. 
- */ -unsigned long __attribute__((weak)) read_persistent_clock(void) -{ - return 0; -} - -/* - * timekeeping_init - Initializes the clocksource and common timekeeping values - */ -void __init timekeeping_init(void) -{ - unsigned long flags; - unsigned long sec = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - ntp_clear(); - - clock = clocksource_get_next(); - clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); - clock->cycle_last = clocksource_read(clock); - - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - write_sequnlock_irqrestore(&xtime_lock, flags); -} - -/* flag for if timekeeping is suspended */ -static int timekeeping_suspended; -/* time in seconds when suspend began */ -static unsigned long timekeeping_suspend_time; - -/** - * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. - */ -static int timekeeping_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long now = read_persistent_clock(); - - write_seqlock_irqsave(&xtime_lock, flags); - - if (now && (now > timekeeping_suspend_time)) { - unsigned long sleep_length = now - timekeeping_suspend_time; - - xtime.tv_sec += sleep_length; - wall_to_monotonic.tv_sec -= sleep_length; - } - /* re-base the last cycle value */ - clock->cycle_last = clocksource_read(clock); - clock->error = 0; - timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - - touch_softlockup_watchdog(); - - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ - hres_timers_resume(); - - return 0; -} - -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - - write_seqlock_irqsave(&xtime_lock, flags); - timekeeping_suspended = 1; - timekeeping_suspend_time = read_persistent_clock(); - write_sequnlock_irqrestore(&xtime_lock, flags); - - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); - - return 0; -} - -/* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, - set_kset_name("timekeeping"), -}; - -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) -{ - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(timekeeping_init_device); - -/* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. - */ -static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, - s64 *offset) -{ - s64 tick_error, i; - u32 look_ahead, adj; - s32 error2, mult; - - /* - * Use the current error value to determine how much to look ahead. - * The larger the error the slower we adjust for it to avoid problems - * with losing too many ticks, otherwise we would overadjust and - * produce an even larger error. The smaller the adjustment the - * faster we try to adjust for it, as lost ticks can do less harm - * here. This is tuned so that an error of about 1 msec is adusted - * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 
- */ - error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); - error2 = abs(error2); - for (look_ahead = 0; error2 > 0; look_ahead++) - error2 >>= 2; - - /* - * Now calculate the error in (1 << look_ahead) ticks, but first - * remove the single look ahead already included in the error. - */ - tick_error = current_tick_length() >> - (TICK_LENGTH_SHIFT - clock->shift + 1); - tick_error -= clock->xtime_interval >> 1; - error = ((error - tick_error) >> look_ahead) + tick_error; - - /* Finally calculate the adjustment shift value. */ - i = *interval; - mult = 1; - if (error < 0) { - error = -error; - *interval = -*interval; - *offset = -*offset; - mult = -1; - } - for (adj = 0; error > i; adj++) - error >>= 1; - - *interval <<= adj; - *offset <<= adj; - return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void clocksource_adjust(struct clocksource *clock, s64 offset) -{ - s64 error, interval = clock->cycle_interval; - int adj; - - error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); - if (error > interval) { - error >>= 2; - if (likely(error <= interval)) - adj = 1; - else - adj = clocksource_bigadjust(error, &interval, &offset); - } else if (error < -interval) { - error >>= 2; - if (likely(error >= -interval)) { - adj = -1; - interval = -interval; - offset = -offset; - } else - adj = clocksource_bigadjust(error, &interval, &offset); - } else - return; - - clock->mult += adj; - clock->xtime_interval += interval; - clock->xtime_nsec -= offset; - clock->error -= (interval - offset) << - (TICK_LENGTH_SHIFT - clock->shift); -} - -/** - * update_wall_time - Uses the current clocksource to increment the wall time - * - * Called from the timer interrupt, must hold a write on xtime_lock. - */ -static void update_wall_time(void) -{ - cycle_t offset; - - /* Make sure we're fully resumed: */ - if (unlikely(timekeeping_suspended)) - return; - -#ifdef CONFIG_GENERIC_TIME - offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; -#else - offset = clock->cycle_interval; -#endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; - - /* normally this loop will run just once, however in the - * case of lost or late ticks, it will accumulate correctly. - */ - while (offset >= clock->cycle_interval) { - /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; - offset -= clock->cycle_interval; - - if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { - clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; - xtime.tv_sec++; - second_overflow(); - } - - /* interpolator bits */ - time_interpolator_update(clock->xtime_interval - >> clock->shift); - - /* accumulate error between NTP and clock interval */ - clock->error += current_tick_length(); - clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); - } - - /* correct the clock when NTP error is too big */ - clocksource_adjust(clock, offset); - - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; - clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; - - /* check to see if there is a new clocksource to use */ - change_clocksource(); - update_vsyscall(&xtime, clock); -} - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. 
@@ -1264,14 +857,6 @@ static inline void calc_load(unsigned long ticks) } /* - * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. - */ -__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - -EXPORT_SYMBOL(xtime_lock); - -/* * This function runs timers and the timer-tq in bottom half context. */ static void run_timer_softirq(struct softirq_action *h) @@ -1617,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu) cpu_to_node(cpu)); if (!base) return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } memset(base, 0, sizeof(*base)); per_cpu(tvec_bases, cpu) = base; } else { @@ -1656,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) struct timer_list *timer; while (!list_empty(head)) { - timer = list_entry(head->next, struct timer_list, entry); + timer = list_first_entry(head, struct timer_list, entry); detach_timer(timer, 0); - timer->base = new_base; + timer_set_base(timer, new_base); internal_add_timer(new_base, timer); } } diff --git a/kernel/uid16.c b/kernel/uid16.c index 187e2a42387..dd308ba4e03 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -6,7 +6,6 @@ #include <linux/mm.h> #include <linux/utsname.h> #include <linux/mman.h> -#include <linux/smp_lock.h> #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index c859164a699..160c8c5136b 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) } /* - * unshare the current process' utsname namespace. - * called only in sys_unshare() - */ -int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) -{ - if (unshare_flags & CLONE_NEWUTS) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - *new_uts = clone_uts_ns(current->nsproxy->uts_ns); - if (!*new_uts) - return -ENOMEM; - } - - return 0; -} - -/* * Copy task tsk's utsname namespace, or clone it if flags * specifies CLONE_NEWUTS. In latter case, changes to the * utsname of this process won't be seen by parent, and vice * versa. */ -int copy_utsname(int flags, struct task_struct *tsk) +struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns) { - struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; - int err = 0; - - if (!old_ns) - return 0; + BUG_ON(!old_ns); get_uts_ns(old_ns); if (!(flags & CLONE_NEWUTS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; - } + return old_ns; new_ns = clone_uts_ns(old_ns); - if (!new_ns) { - err = -ENOMEM; - goto out; - } - tsk->nsproxy->uts_ns = new_ns; -out: put_uts_ns(old_ns); - return err; + return new_ns; } void free_uts_ns(struct kref *kref) |
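
The reworked copy_utsname() above settles into the usual copy-or-share shape: take a reference on the old namespace, return it unchanged unless a new-namespace flag was requested, otherwise clone it and drop the extra reference. The control flow in isolation, with a plain counter standing in for the kernel's kref and a made-up flag name:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define WANT_NEW_NS 0x1                  /* stands in for CLONE_NEWUTS */

    struct ns {
            int count;
            char name[16];
    };

    static struct ns *get_ns(struct ns *n) { n->count++; return n; }
    static void put_ns(struct ns *n) { if (--n->count == 0) free(n); }

    static struct ns *clone_ns(const struct ns *old)
    {
            struct ns *n = malloc(sizeof(*n));

            if (n) {
                    n->count = 1;
                    memcpy(n->name, old->name, sizeof(n->name));
            }
            return n;
    }

    /* Same shape as copy_utsname(): share by default, clone on request. */
    static struct ns *copy_ns(int flags, struct ns *old)
    {
            struct ns *new_ns;

            get_ns(old);
            if (!(flags & WANT_NEW_NS))
                    return old;

            new_ns = clone_ns(old);
            put_ns(old);                     /* drop the reference taken above */
            return new_ns;
    }

    int main(void)
    {
            struct ns parent = { .count = 1, .name = "parent" };

            struct ns *shared = copy_ns(0, &parent);
            struct ns *fresh  = copy_ns(WANT_NEW_NS, &parent);

            printf("shared=%d cloned=%d parent.count=%d\n",
                   shared == &parent, fresh && fresh != &parent, parent.count);
            /* prints: shared=1 cloned=1 parent.count=2 */
            return 0;
    }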