diff options
Diffstat (limited to 'kernel')
96 files changed, 7783 insertions, 6575 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 057472fbc27..0b72d1a74be 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o -obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o +obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ -obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o @@ -99,8 +100,6 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/acct.c b/kernel/acct.c index 385b88461c2..fa7eb3de2dd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(file->f_path.dentry, &sbuf)) + if (vfs_statfs(&file->f_path, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; diff --git a/kernel/async.c b/kernel/async.c index 15319d6c18f..cd9dbb913c7 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel. */ #include <linux/async.h> -#include <linux/bug.h> #include <linux/module.h> #include <linux/wait.h> #include <linux/sched.h> -#include <linux/init.h> -#include <linux/kthread.h> -#include <linux/delay.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; -#define MAX_THREADS 256 #define MAX_WORK 32768 static LIST_HEAD(async_pending); static LIST_HEAD(async_running); static DEFINE_SPINLOCK(async_lock); -static int async_enabled = 0; - struct async_entry { - struct list_head list; - async_cookie_t cookie; - async_func_ptr *func; - void *data; - struct list_head *running; + struct list_head list; + struct work_struct work; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); -static DECLARE_WAIT_QUEUE_HEAD(async_new); static atomic_t entry_count; -static atomic_t thread_count; extern int initcall_debug; @@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running) spin_unlock_irqrestore(&async_lock, flags); return ret; } + /* * pick the first pending entry and run it */ -static void run_one_entry(void) +static void async_run_entry_fn(struct work_struct *work) { + struct async_entry *entry = + container_of(work, struct async_entry, work); unsigned long flags; - struct async_entry *entry; ktime_t calltime, delta, rettime; - /* 1) pick one task from the pending queue */ - + /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - if (list_empty(&async_pending)) - goto out; - entry = list_first_entry(&async_pending, struct async_entry, list); - - /* 2) move it to the running queue */ list_move_tail(&entry->list, entry->running); spin_unlock_irqrestore(&async_lock, flags); - /* 3) run it (and print duration)*/ + /* 2) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); @@ -153,31 +142,25 @@ static void run_one_entry(void) (long long)ktime_to_ns(delta) >> 10); } - /* 4) remove it from the running queue */ + /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); - /* 5) free the entry */ + /* 4) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 6) wake up any waiters. */ + /* 5) wake up any waiters */ wake_up(&async_done); - return; - -out: - spin_unlock_irqrestore(&async_lock, flags); } - static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) { struct async_entry *entry; unsigned long flags; async_cookie_t newcookie; - /* allow irq-off callers */ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); @@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ - if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { + if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; @@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l ptr(data, newcookie); return newcookie; } + INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; entry->running = running; @@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l list_add_tail(&entry->list, &async_pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - wake_up(&async_new); + + /* schedule for execution */ + queue_work(system_unbound_wq, &entry->work); + return newcookie; } @@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie) async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); - - -static int async_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int ret = HZ; - set_current_state(TASK_INTERRUPTIBLE); - /* - * check the list head without lock.. false positives - * are dealt with inside run_one_entry() while holding - * the lock. - */ - rmb(); - if (!list_empty(&async_pending)) - run_one_entry(); - else - ret = schedule_timeout(HZ); - - if (ret == 0) { - /* - * we timed out, this means we as thread are redundant. - * we sign off and die, but we to avoid any races there - * is a last-straw check to see if work snuck in. - */ - atomic_dec(&thread_count); - wmb(); /* manager must see our departure first */ - if (list_empty(&async_pending)) - break; - /* - * woops work came in between us timing out and us - * signing off; we need to stay alive and keep working. - */ - atomic_inc(&thread_count); - } - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int async_manager_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int tc, ec; - - set_current_state(TASK_INTERRUPTIBLE); - - tc = atomic_read(&thread_count); - rmb(); - ec = atomic_read(&entry_count); - - while (tc < ec && tc < MAX_THREADS) { - if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", - tc))) { - msleep(100); - continue; - } - atomic_inc(&thread_count); - tc++; - } - - schedule(); - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int __init async_init(void) -{ - async_enabled = - !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); - - WARN_ON(!async_enabled); - return 0; -} - -core_initcall(async_init); diff --git a/kernel/audit.c b/kernel/audit.c index c71bd26631a..d96045789b5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -56,7 +56,6 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> -#include <linux/inotify.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb) audit_hold_skb(skb); } else /* drop the extra reference if sent ok */ - kfree_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) diff --git a/kernel/audit.h b/kernel/audit.h index 208687be4f3..f7206db4e13 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); + /* audit watch functions */ -extern unsigned long audit_watch_inode(struct audit_watch *watch); -extern dev_t audit_watch_dev(struct audit_watch *watch); +#ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); -extern int audit_add_watch(struct audit_krule *krule); -extern void audit_remove_watch(struct audit_watch *watch); -extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); -extern void audit_inotify_unregister(struct list_head *in_list); +extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); +extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern struct list_head *audit_watch_rules(struct audit_watch *watch); - -extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +#else +#define audit_put_watch(w) {} +#define audit_get_watch(w) {} +#define audit_to_watch(k, p, l, o) (-EINVAL) +#define audit_add_watch(k, l) (-EINVAL) +#define audit_remove_watch_rule(k) BUG() +#define audit_watch_path(w) "" +#define audit_watch_compare(w, i, d) 0 + +#endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 46a57b57a33..7f18d3a4527 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1,5 +1,5 @@ #include "audit.h" -#include <linux/inotify.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/kthread.h> @@ -22,7 +22,7 @@ struct audit_tree { struct audit_chunk { struct list_head hash; - struct inotify_watch watch; + struct fsnotify_mark mark; struct list_head trees; /* with root here */ int dead; int count; @@ -59,7 +59,7 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. @@ -68,7 +68,7 @@ static LIST_HEAD(prune_list); * that makes a difference. Some. */ -static struct inotify_handle *rtree_ih; +static struct fsnotify_group *audit_tree_group; static struct audit_tree *alloc_tree(const char *s) { @@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree) return tree->pathname; } -static struct audit_chunk *alloc_chunk(int count) -{ - struct audit_chunk *chunk; - size_t size; - int i; - - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); - if (!chunk) - return NULL; - - INIT_LIST_HEAD(&chunk->hash); - INIT_LIST_HEAD(&chunk->trees); - chunk->count = count; - atomic_long_set(&chunk->refs, 1); - for (i = 0; i < count; i++) { - INIT_LIST_HEAD(&chunk->owners[i].list); - chunk->owners[i].index = i; - } - inotify_init_watch(&chunk->watch); - return chunk; -} - static void free_chunk(struct audit_chunk *chunk) { int i; @@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu) audit_put_chunk(chunk); } +static void audit_tree_destroy_watch(struct fsnotify_mark *entry) +{ + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + call_rcu(&chunk->head, __put_chunk); +} + +static struct audit_chunk *alloc_chunk(int count) +{ + struct audit_chunk *chunk; + size_t size; + int i; + + size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); + chunk = kzalloc(size, GFP_KERNEL); + if (!chunk) + return NULL; + + INIT_LIST_HEAD(&chunk->hash); + INIT_LIST_HEAD(&chunk->trees); + chunk->count = count; + atomic_long_set(&chunk->refs, 1); + for (i = 0; i < count; i++) { + INIT_LIST_HEAD(&chunk->owners[i].list); + chunk->owners[i].index = i; + } + fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + return chunk; +} + enum {HASH_SIZE = 128}; static struct list_head chunk_hash_heads[HASH_SIZE]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); @@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode) return chunk_hash_heads + n % HASH_SIZE; } -/* hash_lock is held by caller */ +/* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - struct list_head *list = chunk_hash(chunk->watch.inode); + struct fsnotify_mark *entry = &chunk->mark; + struct list_head *list; + + if (!entry->i.inode) + return; + list = chunk_hash(entry->i.inode); list_add_rcu(&chunk->hash, list); } @@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - if (p->watch.inode == inode) { + /* mark.inode may have gone NULL, but who cares? */ + if (p->mark.i.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p) static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); + struct fsnotify_mark *entry = &chunk->mark; struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; - if (!pin_inotify_watch(&chunk->watch)) { - /* - * Filesystem is shutting down; all watches are getting - * evicted, just take it off the node list for this - * tree and let the eviction logics take care of the - * rest. - */ - owner = p->owner; - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - return; - } + fsnotify_get_mark(entry); spin_unlock(&hash_lock); - /* - * pin_inotify_watch() succeeded, so the watch won't go away - * from under us. - */ - mutex_lock(&chunk->watch.inode->inotify_mutex); - if (chunk->dead) { - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_lock(&entry->lock); + if (chunk->dead || !entry->i.inode) { + spin_unlock(&entry->lock); goto out; } @@ -256,16 +249,17 @@ static void untag_chunk(struct node *p) list_del_init(&p->list); list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; } new = alloc_chunk(size); if (!new) goto Fallback; - if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { + fsnotify_duplicate_mark(&new->mark, entry); + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); goto Fallback; } @@ -298,9 +292,9 @@ static void untag_chunk(struct node *p) list_for_each_entry(owner, &new->trees, same_root) owner->root = new; spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; Fallback: @@ -314,31 +308,33 @@ Fallback: p->owner = NULL; put_tree(owner); spin_unlock(&hash_lock); - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_unlock(&entry->lock); out: - unpin_inotify_watch(&chunk->watch); + fsnotify_put_mark(entry); spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) { + struct fsnotify_mark *entry; struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) return -ENOMEM; - if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { + entry = &chunk->mark; + if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { free_chunk(chunk); return -ENOSPC; } - mutex_lock(&inode->inotify_mutex); + spin_lock(&entry->lock); spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); return 0; } chunk->owners[0].index = (1U << 31); @@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) } insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&inode->inotify_mutex); + spin_unlock(&entry->lock); return 0; } /* the first tagged inode becomes root of tree */ static int tag_chunk(struct inode *inode, struct audit_tree *tree) { - struct inotify_watch *watch; + struct fsnotify_mark *old_entry, *chunk_entry; struct audit_tree *owner; struct audit_chunk *chunk, *old; struct node *p; int n; - if (inotify_find_watch(rtree_ih, inode, &watch) < 0) + old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + if (!old_entry) return create_chunk(inode, tree); - old = container_of(watch, struct audit_chunk, watch); + old = container_of(old_entry, struct audit_chunk, mark); /* are we already there? */ spin_lock(&hash_lock); for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return 0; } } @@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return -ENOMEM; } - mutex_lock(&inode->inotify_mutex); - if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); + chunk_entry = &chunk->mark; + + spin_lock(&old_entry->lock); + if (!old_entry->i.inode) { + /* old_entry is being shot, lets just lie */ + spin_unlock(&old_entry->lock); + fsnotify_put_mark(old_entry); free_chunk(chunk); + return -ENOENT; + } + + fsnotify_duplicate_mark(chunk_entry, old_entry); + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + spin_unlock(&old_entry->lock); + free_chunk(chunk); + fsnotify_put_mark(old_entry); return -ENOSPC; } + + /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ + spin_lock(&chunk_entry->lock); spin_lock(&hash_lock); + + /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); - put_inotify_watch(&chunk->watch); + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + + fsnotify_destroy_mark(chunk_entry); + + fsnotify_put_mark(chunk_entry); + fsnotify_put_mark(old_entry); return 0; } list_replace_init(&old->trees, &chunk->trees); @@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } spin_unlock(&hash_lock); - inotify_evict_watch(&old->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ - put_inotify_watch(&old->watch); /* and kill it */ + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ + fsnotify_put_mark(old_entry); /* and kill it */ return 0; } @@ -584,7 +601,9 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { - struct inode *inode = find_chunk(node)->watch.inode; + struct audit_chunk *chunk = find_chunk(node); + /* this could be NULL if the watch is dieing else where... */ + struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); @@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list) * Here comes the stuff asynchronous to auditctl operations */ -/* inode->inotify_mutex is locked */ static void evict_chunk(struct audit_chunk *chunk) { struct audit_tree *owner; @@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk) mutex_unlock(&audit_filter_mutex); } -static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static int audit_tree_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmonut_mark, + struct fsnotify_event *event) +{ + BUG(); + return -EOPNOTSUPP; +} + +static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); - if (mask & IN_IGNORED) { - evict_chunk(chunk); - put_inotify_watch(watch); - } + evict_chunk(chunk); + fsnotify_put_mark(entry); } -static void destroy_watch(struct inotify_watch *watch) +static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - call_rcu(&chunk->head, __put_chunk); + return false; } -static const struct inotify_operations rtree_inotify_ops = { - .handle_event = handle_event, - .destroy_watch = destroy_watch, +static const struct fsnotify_ops audit_tree_ops = { + .handle_event = audit_tree_handle_event, + .should_send_event = audit_tree_send_event, + .free_group_priv = NULL, + .free_event_priv = NULL, + .freeing_mark = audit_tree_freeing_mark, }; static int __init audit_tree_init(void) { int i; - rtree_ih = inotify_init(&rtree_inotify_ops); - if (IS_ERR(rtree_ih)) - audit_panic("cannot initialize inotify handle for rectree watches"); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); + if (IS_ERR(audit_tree_group)) + audit_panic("cannot initialize fsnotify group for rectree watches"); for (i = 0; i < HASH_SIZE; i++) INIT_LIST_HEAD(&chunk_hash_heads[i]); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 8df43696f4b..6bf2306be7d 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -24,18 +24,18 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/fs.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/inotify.h> #include <linux/security.h> #include "audit.h" /* * Reference counting: * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED * event. Each audit_watch holds a reference to its associated parent. * * audit_watch: if added to lists, lifetime is from audit_init_watch() to @@ -51,40 +51,61 @@ struct audit_watch { unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* associated rules */ + struct list_head rules; /* anchor for krule->rlist */ }; struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ + struct list_head watches; /* anchor for audit_watch->wlist */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ }; -/* Inotify handle. */ -struct inotify_handle *audit_ih; +/* fsnotify handle. */ +struct fsnotify_group *audit_watch_group; -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 +/* fsnotify events we care about. */ +#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) -/* Inotify events we care about. */ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF +static void audit_free_parent(struct audit_parent *parent) +{ + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} -static void audit_free_parent(struct inotify_watch *i_watch) +static void audit_watch_free_mark(struct fsnotify_mark *entry) { struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); + parent = container_of(entry, struct audit_parent, mark); + audit_free_parent(parent); +} + +static void audit_get_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_get_mark(&parent->mark); +} + +static void audit_put_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_put_mark(&parent->mark); +} + +/* + * Find and return the audit_parent on the given inode. If found a reference + * is taken on this parent. + */ +static inline struct audit_parent *audit_find_parent(struct inode *inode) +{ + struct audit_parent *parent = NULL; + struct fsnotify_mark *entry; + + entry = fsnotify_find_inode_mark(audit_watch_group, inode); + if (entry) + parent = container_of(entry, struct audit_parent, mark); + + return parent; } void audit_get_watch(struct audit_watch *watch) @@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch) void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); + audit_put_parent(watch->parent); watch->parent = NULL; audit_put_watch(watch); /* match initial get */ } @@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch) return watch->path; } -struct list_head *audit_watch_rules(struct audit_watch *watch) -{ - return &watch->rules; -} - -unsigned long audit_watch_inode(struct audit_watch *watch) +int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return watch->ino; -} - -dev_t audit_watch_dev(struct audit_watch *watch) -{ - return watch->dev; + return (watch->ino != (unsigned long)-1) && + (watch->ino == ino) && + (watch->dev == dev); } /* Initialize a parent watch entry. */ static struct audit_parent *audit_init_parent(struct nameidata *ndp) { + struct inode *inode = ndp->path.dentry->d_inode; struct audit_parent *parent; - s32 wd; + int ret; parent = kzalloc(sizeof(*parent), GFP_KERNEL); if (unlikely(!parent)) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); + + fsnotify_init_mark(&parent->mark, audit_watch_free_mark); + parent->mark.mask = AUDIT_FS_WATCH; + ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); + if (ret < 0) { + audit_free_parent(parent); + return ERR_PTR(ret); } return parent; @@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; - if (!audit_ih) + if (!audit_watch_group) return -EOPNOTSUPP; if (path[0] != '/' || path[len-1] == '/' || @@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) new->dev = old->dev; new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); + audit_get_parent(old->parent); new->parent = old->parent; out: @@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_entry *oentry, *nentry; mutex_lock(&audit_filter_mutex); + /* Run all of the watches on this parent looking for the one that + * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { if (audit_compare_dname_path(dname, owatch->path, NULL)) continue; /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. */ - if (invalidating && current->audit_context) + if (invalidating && !audit_dummy_context()) audit_filter_inodes(current, current->audit_context); + /* updating ino will likely change which audit_hash_list we + * are on so we need a new watch for the new list */ nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { mutex_unlock(&audit_filter_mutex); @@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.rlist); list_del_rcu(&oentry->list); - nentry = audit_dupe_rule(&oentry->rule, nwatch); + nentry = audit_dupe_rule(&oentry->rule); if (IS_ERR(nentry)) { list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); } else { int h = audit_hash_ino((u32)ino); + + /* + * nentry->rule.watch == oentry->rule.watch so + * we must drop that reference and set it to our + * new watch. + */ + audit_put_watch(nentry->rule.watch); + audit_get_watch(nwatch); + nentry->rule.watch = nwatch; list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); list_replace(&oentry->rule.list, @@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_entry *e; mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); @@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_remove_watch(w); } mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } + fsnotify_destroy_mark(&parent->mark); } /* Get path information necessary for adding watches. */ @@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) } } -/* Associate the given rule with an existing parent inotify_watch. +/* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, struct audit_parent *parent) @@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule, struct audit_watch *w, *watch = krule->watch; int watch_found = 0; + BUG_ON(!mutex_is_locked(&audit_filter_mutex)); + list_for_each_entry(w, &parent->watches, wlist) { if (strcmp(watch->path, w->path)) continue; @@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule, } if (!watch_found) { - get_inotify_watch(&parent->wdata); + audit_get_parent(parent); watch->parent = parent; list_add(&watch->wlist, &parent->watches); @@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule, /* Find a matching watch entry, or add this one. * Caller must hold audit_filter_mutex. */ -int audit_add_watch(struct audit_krule *krule) +int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; struct audit_parent *parent; struct nameidata *ndp = NULL, *ndw = NULL; - int ret = 0; + int h, ret = 0; mutex_unlock(&audit_filter_mutex); @@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule) goto error; } + mutex_lock(&audit_filter_mutex); + /* update watch filter fields */ if (ndw) { watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; watch->ino = ndw->path.dentry->d_inode->i_ino; } - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. - */ - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { + /* either find an old parent or attach a new one */ + parent = audit_find_parent(ndp->path.dentry->d_inode); + if (!parent) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); ret = PTR_ERR(parent); goto error; } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); + } - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); + audit_add_to_parent(krule, parent); - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); + /* match get in audit_find_parent or audit_init_parent */ + audit_put_parent(parent); + h = audit_hash_ino((u32)watch->ino); + *list = &audit_inode_hash[h]; error: audit_put_nd(ndp, ndw); /* NULL args OK */ return ret; } -void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) +void audit_remove_watch_rule(struct audit_krule *krule) { struct audit_watch *watch = krule->watch; struct audit_parent *parent = watch->parent; @@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) audit_remove_watch(watch); if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, list); + audit_get_parent(parent); + fsnotify_destroy_mark(&parent->mark); + audit_put_parent(parent); } } } -/* Update watch data in audit rules based on inotify events. */ -static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + return true; +} + +/* Update watch data in audit rules based on fsnotify events. */ +static int audit_watch_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) { + struct inode *inode; + __u32 mask = event->mask; + const char *dname = event->file_name; struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); + parent = container_of(inode_mark, struct audit_parent, mark); - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) + BUG_ON(group != audit_watch_group); + + switch (event->data_type) { + case (FSNOTIFY_EVENT_FILE): + inode = event->file->f_path.dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = event->inode; + break; + default: + BUG(); + inode = NULL; + break; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + else if (mask & (FS_DELETE|FS_MOVED_FROM)) audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { + else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); + + return 0; } -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, +static const struct fsnotify_ops audit_watch_fsnotify_ops = { + .should_send_event = audit_watch_should_send_event, + .handle_event = audit_watch_handle_event, + .free_group_priv = NULL, + .freeing_mark = NULL, + .free_event_priv = NULL, }; static int __init audit_watch_init(void) { - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + if (IS_ERR(audit_watch_group)) { + audit_watch_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } return 0; } -subsys_initcall(audit_watch_init); +device_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ce08041f578..eb7675499fb 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e) { int i; struct audit_krule *erule = &e->rule; + /* some rules don't have associated watches */ if (erule->watch) audit_put_watch(erule->watch); @@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ -struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) +struct audit_entry *audit_dupe_rule(struct audit_krule *old) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; - new->watch = NULL; new->field_count = old->field_count; + /* * note that we are OK with not refcounting here; audit_match_tree() * never dereferences tree and we can't get false positives there @@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, } } - if (watch) { - audit_get_watch(watch); - new->watch = watch; + if (old->watch) { + audit_get_watch(old->watch); + new->watch = old->watch; } return entry; @@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int h, err; + int err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry) if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule); + err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); goto error; } - /* entry->rule.watch may have changed during audit_add_watch() */ - watch = entry->rule.watch; - h = audit_hash_ino((u32)audit_watch_inode(watch)); - list = &audit_inode_hash[h]; } if (tree) { err = audit_add_tree_rule(&entry->rule); @@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - LIST_HEAD(inotify_list); int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry) } if (e->rule.watch) - audit_remove_watch_rule(&e->rule, &inotify_list); + audit_remove_watch_rule(&e->rule); if (e->rule.tree) audit_remove_tree_rule(&e->rule); @@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - if (!list_empty(&inotify_list)) - audit_inotify_unregister(&inotify_list); - out: if (watch) audit_put_watch(watch); /* match initial get */ @@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r) { struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; - struct audit_watch *watch; - struct audit_tree *tree; int err = 0; if (!security_audit_rule_known(r)) return 0; - watch = r->watch; - tree = r->tree; - nentry = audit_dupe_rule(r, watch); + nentry = audit_dupe_rule(r); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); - if (watch) + if (r->watch) list_del(&r->rlist); list_del_rcu(&entry->list); list_del(&r->list); } else { - if (watch) { - list_add(&nentry->rule.rlist, audit_watch_rules(watch)); - list_del(&r->rlist); - } else if (tree) + if (r->watch || r->tree) list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); list_replace(&r->list, &nentry->rule.list); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3828ad5fb8f..b87a63beb66 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -65,7 +65,6 @@ #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> -#include <linux/inotify.h> #include <linux/capability.h> #include <linux/fs_struct.h> @@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) - result = (name->dev == audit_watch_dev(rule->watch) && - name->ino == audit_watch_inode(rule->watch)); + if (name) + result = audit_watch_compare(rule->watch, name->ino, name->dev); break; case AUDIT_DIR: if (ctx) @@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(list_empty(&inode->inotify_watches))) + if (likely(hlist_empty(&inode->i_fsnotify_marks))) return; context = current->audit_context; p = context->trees; @@ -1769,7 +1767,7 @@ retry: seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d->d_inode; - if (inode && unlikely(!list_empty(&inode->inotify_watches))) { + if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ac6f5b0a64..d83cab06da8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct kobject *cgroup_kobj; + static inline struct cgroup *__d_cgrp(struct dentry *dentry) { return dentry->d_fsdata; @@ -1788,6 +1790,29 @@ out: return retval; } +/** + * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup + * @tsk: the task to be attached + */ +int cgroup_attach_task_current_cg(struct task_struct *tsk) +{ + struct cgroupfs_root *root; + struct cgroup *cur_cg; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + cur_cg = task_cgroup_from_root(current, root); + retval = cgroup_attach_task(cur_cg, tsk); + if (retval) + break; + } + cgroup_unlock(); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); + /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex * held. May take task_lock of task @@ -3871,9 +3896,18 @@ int __init cgroup_init(void) hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); BUG_ON(!init_root_id(&rootnode)); + + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) { + err = -ENOMEM; + goto out; + } + err = register_filesystem(&cgroup_fs_type); - if (err < 0) + if (err < 0) { + kobject_put(cgroup_kobj); goto out; + } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4a..f6e726f1849 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - set_cpu_active(cpu, false); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { - set_cpu_active(cpu, true); - nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611eadd..b23c0979bbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -105,7 +105,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset heirarchy */ + /* used for walking a cpuset hierarchy */ struct list_head stack_list; }; @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); diff --git a/kernel/cred.c b/kernel/cred.c index a2d5504fbcc..60bc8b1e32e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cb7cd1de10..3c2d4972d23 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -605,13 +605,15 @@ cpu_master_loop: if (dbg_kdb_mode) { kgdb_connected = 1; error = kdb_stub(ks); + if (error == -1) + continue; + kgdb_connected = 0; } else { error = gdb_serial_stub(ks); } if (error == DBG_PASS_EVENT) { dbg_kdb_mode = !dbg_kdb_mode; - kgdb_connected = 0; } else if (error == DBG_SWITCH_CPU_EVENT) { dbg_cpu_switch(cpu, dbg_switch_cpu); goto cpu_loop; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 4b17b326952..481a7bd2dfe 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES + * GDB remote protocol parser: */ -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - #ifdef CONFIG_KGDB_KDB static int gdbstub_read_wait(void) { @@ -123,8 +112,8 @@ static void get_packet(char *buffer) buffer[count] = 0; if (ch == '#') { - xmitcsum = hex(gdbstub_read_wait()) << 4; - xmitcsum += hex(gdbstub_read_wait()); + xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; + xmitcsum += hex_to_bin(gdbstub_read_wait()); if (checksum != xmitcsum) /* failed checksum */ @@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len) * buf. Return a pointer to the last char put in buf (null). May * return an error. */ -int kgdb_mem2hex(char *mem, char *buf, int count) +char *kgdb_mem2hex(char *mem, char *buf, int count) { char *tmp; int err; @@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count) tmp = buf + count; err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; + if (err) + return NULL; + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; } + *buf = 0; - return err; + return buf; } /* @@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count) tmp_hex = tmp_raw - 1; while (tmp_hex >= buf) { tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; + *tmp_raw = hex_to_bin(*tmp_hex--); + *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; } return probe_kernel_write(mem, tmp_raw, count); @@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } while (**ptr) { - hex_val = hex(**ptr); + hex_val = hex_to_bin(**ptr); if (hex_val < 0) break; @@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) return probe_kernel_write(mem, c, size); } +#if DBG_MAX_REG_NUM > 0 +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_get_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_set_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Write memory due to an 'M' or 'X' packet. */ static int write_mem_msg(int binary) { @@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error) * remapped to negative TIDs. */ -#define BUF_THREAD_ID_SIZE 16 +#define BUF_THREAD_ID_SIZE 8 static char *pack_threadid(char *pkt, unsigned char *id) { - char *limit; + unsigned char *limit; + int lzero = 1; + + limit = id + (BUF_THREAD_ID_SIZE / 2); + while (id < limit) { + if (!lzero || *id != 0) { + pkt = pack_hex_byte(pkt, *id); + lzero = 0; + } + id++; + } - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); + if (lzero) + pkt = pack_hex_byte(pkt, 0); return pkt; } static void int_to_threadref(unsigned char *id, int value) { - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); + put_unaligned_be32(value, id); } static struct task_struct *getthread(struct pt_regs *regs, int tid) @@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) pack_hex_byte(&remcom_out_buffer[1], ks->signo); } -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) +static void gdb_get_regs_helper(struct kgdb_state *ks) { struct task_struct *thread; void *local_debuggerinfo; @@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) */ sleeping_thread_to_gdb_regs(gdb_regs, thread); } +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + gdb_get_regs_helper(ks); kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); } @@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks) char *ptr = &remcom_in_buffer[1]; unsigned long length; unsigned long addr; - int err; + char *err; if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && kgdb_hex2long(&ptr, &length) > 0) { err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); + if (!err) + error_packet(remcom_out_buffer, -EINVAL); } else { error_packet(remcom_out_buffer, -EINVAL); } @@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) strcpy(remcom_out_buffer, "OK"); } +#if DBG_MAX_REG_NUM > 0 +static char *gdb_hex_reg_helper(int regnum, char *out) +{ + int i; + int offset = 0; + + for (i = 0; i < regnum; i++) + offset += dbg_reg_def[i].size; + return kgdb_mem2hex((char *)gdb_regs + offset, out, + dbg_reg_def[i].size); +} + +/* Handle the 'p' individual regster get */ +static void gdb_cmd_reg_get(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (regnum >= DBG_MAX_REG_NUM) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + gdb_get_regs_helper(ks); + gdb_hex_reg_helper(regnum, remcom_out_buffer); +} + +/* Handle the 'P' individual regster set */ +static void gdb_cmd_reg_set(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + int i = 0; + + kgdb_hex2long(&ptr, ®num); + if (*ptr++ != '=' || + !(!kgdb_usethread || kgdb_usethread == current) || + !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + memset(gdb_regs, 0, sizeof(gdb_regs)); + while (i < sizeof(gdb_regs) * 2) + if (hex_to_bin(ptr[i]) >= 0) + i++; + else + break; + i = i / 2; + kgdb_hex2mem(ptr, (char *)gdb_regs, i); + dbg_set_reg(regnum, gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Handle the 'X' memory binary write bytes */ static void gdb_cmd_binwrite(struct kgdb_state *ks) { @@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) { struct task_struct *g; struct task_struct *p; - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; int i; int cpu; @@ -621,10 +697,8 @@ static void gdb_cmd_query(struct kgdb_state *ks) switch (remcom_in_buffer[1]) { case 's': case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) break; - } i = 0; remcom_out_buffer[0] = 'm'; @@ -634,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) for_each_online_cpu(cpu) { ks->thr_query = 0; int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; i++; } @@ -644,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) do_each_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; ks->thr_query++; if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) @@ -665,10 +737,9 @@ static void gdb_cmd_query(struct kgdb_state *ks) pack_threadid(remcom_out_buffer + 2, thref); break; case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) break; - } + ks->threadid = 0; ptr = remcom_in_buffer + 17; kgdb_hex2long(&ptr, &ks->threadid); @@ -861,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks) int error = 0; int tmp; - /* Clear the out buffer. */ + /* Initialize comm buffer and globals. */ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; if (kgdb_connected) { - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; /* Reply to host that an exception has occurred */ @@ -879,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks) put_packet(remcom_out_buffer); } - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - while (1) { error = 0; @@ -907,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks) case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_memwrite(ks); break; +#if DBG_MAX_REG_NUM > 0 + case 'p': /* pXX Return gdb register XX (in hex) */ + gdb_cmd_reg_get(ks); + break; + case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ + gdb_cmd_reg_set(ks); + break; +#endif /* DBG_MAX_REG_NUM > 0 */ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_binwrite(ks); break; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 184cd8209c3..28b844118bb 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value) if (endp == arg) { /* - * Try base 16, for us folks too lazy to type the + * Also try base 16, for us folks too lazy to type the * leading 0x... */ val = simple_strtoul(arg, &endp, 16); @@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value) return 0; } +int kdbgetu64arg(const char *arg, u64 *value) +{ + char *endp; + u64 val; + + val = simple_strtoull(arg, &endp, 0); + + if (endp == arg) { + + val = simple_strtoull(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + /* * kdb_set - This function implements the 'set' command. Alter an * existing environment variable or create a new one. @@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv) */ static int kdb_rd(int argc, const char **argv) { - int diag = kdb_check_regs(); - if (diag) - return diag; + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 + int i; + char *rname; + int rsize; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (len) + return len; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + rsize = dbg_reg_def[i].size * 2; + if (rsize > 16) + rsize = 2; + if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { + len = 0; + kdb_printf("\n"); + } + if (len) + len += kdb_printf(" "); + switch(dbg_reg_def[i].size * 8) { + case 8: + rname = dbg_get_reg(i, ®8, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %02x", rname, reg8); + break; + case 16: + rname = dbg_get_reg(i, ®16, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %04x", rname, reg16); + break; + case 32: + rname = dbg_get_reg(i, ®32, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %08x", rname, reg32); + break; + case 64: + rname = dbg_get_reg(i, ®64, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %016llx", rname, reg64); + break; + default: + len += kdb_printf("%s: ??", dbg_reg_def[i].name); + } + } + kdb_printf("\n"); +#else + if (len) + return len; kdb_dumpregs(kdb_current_regs); +#endif return 0; } @@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv) * kdb_rm - This function implements the 'rm' (register modify) command. * rm register-name new-contents * Remarks: - * Currently doesn't allow modification of control or - * debug registers. + * Allows register modification with the same restrictions as gdb */ static int kdb_rm(int argc, const char **argv) { +#if DBG_MAX_REG_NUM > 0 int diag; - int ind = 0; - unsigned long contents; + const char *rname; + int i; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; if (argc != 2) return KDB_ARGCOUNT; /* * Allow presence or absence of leading '%' symbol. */ - if (argv[1][0] == '%') - ind = 1; + rname = argv[1]; + if (*rname == '%') + rname++; - diag = kdbgetularg(argv[2], &contents); + diag = kdbgetu64arg(argv[2], ®64); if (diag) return diag; diag = kdb_check_regs(); if (diag) return diag; + + diag = KDB_BADREG; + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + if (strcmp(rname, dbg_reg_def[i].name) == 0) { + diag = 0; + break; + } + } + if (!diag) { + switch(dbg_reg_def[i].size * 8) { + case 8: + reg8 = reg64; + dbg_set_reg(i, ®8, kdb_current_regs); + break; + case 16: + reg16 = reg64; + dbg_set_reg(i, ®16, kdb_current_regs); + break; + case 32: + reg32 = reg64; + dbg_set_reg(i, ®32, kdb_current_regs); + break; + case 64: + dbg_set_reg(i, ®64, kdb_current_regs); + break; + } + } + return diag; +#else kdb_printf("ERROR: Register set currently not implemented\n"); - return 0; + return 0; +#endif } #if defined(CONFIG_MAGIC_SYSRQ) @@ -1820,9 +1928,8 @@ static int kdb_sr(int argc, const char **argv) { if (argc != 1) return KDB_ARGCOUNT; - sysrq_toggle_support(1); kdb_trap_printk++; - handle_sysrq(*argv[1], NULL); + __handle_sysrq(*argv[1], NULL, 0); kdb_trap_printk--; return 0; @@ -1883,6 +1990,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); #ifdef CONFIG_MODULE_UNLOAD { @@ -2291,6 +2399,9 @@ static int kdb_ll(int argc, const char **argv) while (va) { char buf[80]; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); diag = kdb_parse(buf); if (diag) @@ -2437,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { + struct timespec now; struct kdb_tm tm; struct sysinfo val; @@ -2451,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - kdb_gmtime(&xtime, &tm); + now = __current_kernel_time(); + kdb_gmtime(&now, &tm); kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 97d3ba69775..c438f545a32 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); extern int kdb_putword(unsigned long, unsigned long, size_t); extern int kdbgetularg(const char *, unsigned long *); -extern int kdb_set(int, const char **); extern char *kdbgetenv(const char *); -extern int kdbgetintenv(const char *, int *); extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); extern int kdbgetsymval(const char *, kdb_symtab_t *); diff --git a/kernel/early_res.c b/kernel/early_res.c index 31aa9332ef3..7bfae887f21 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -7,6 +7,8 @@ #include <linux/bootmem.h> #include <linux/mm.h> #include <linux/early_res.h> +#include <linux/slab.h> +#include <linux/kmemleak.h> /* * Early reserved memory areas. @@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + i = find_overlapped_early(start, end); r = &early_res[i]; if (i >= max_early_res || r->end != end || r->start != start) @@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + if (start == end) return; diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index dd62f8e714c..0dbeae37422 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -134,23 +134,14 @@ unregister: return 0; } -int -__set_personality(unsigned int personality) +int __set_personality(unsigned int personality) { - struct exec_domain *ep, *oep; - - ep = lookup_exec_domain(personality); - if (ep == current_thread_info()->exec_domain) { - current->personality = personality; - module_put(ep->module); - return 0; - } + struct exec_domain *oep = current_thread_info()->exec_domain; + current_thread_info()->exec_domain = lookup_exec_domain(personality); current->personality = personality; - oep = current_thread_info()->exec_domain; - current_thread_info()->exec_domain = ep; - module_put(oep->module); + return 0; } @@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) { unsigned int old = current->personality; - if (personality != 0xffffffff) { + if (personality != 0xffffffff) set_personality(personality); - if (current->personality != personality) - return -EINVAL; - } return old; } diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba04..98b450876f9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; + sig->oom_score_adj = current->signal->oom_score_adj; return 0; } @@ -907,7 +908,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0..ce669174f35 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); xts = __current_kernel_time(); - tom = wall_to_monotonic; + tom = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } @@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset; + struct timespec realtime_offset, wtm; unsigned long seq; if (!hrtimer_hres_active()) @@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) do { seq = read_seqbegin(&xtime_lock); - set_normalized_timespec(&realtime_offset, - -wall_to_monotonic.tv_sec, - -wall_to_monotonic.tv_nsec); + wtm = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 7a56b22e060..d71a987fd2b 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -41,6 +41,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ @@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) return 0; } -static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = tsk->perf_event_ctxp; - struct list_head *list; - struct perf_event *bp; - unsigned long flags; + struct perf_event_context *ctx = bp->ctx; + struct perf_event *iter; int count = 0; - if (WARN_ONCE(!ctx, "No perf context for this task")) - return 0; - - list = &ctx->event_list; - - raw_spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - if (find_slot_idx(bp) == type) - count += hw_breakpoint_weight(bp); + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->ctx == ctx && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); } - raw_spin_unlock_irqrestore(&ctx->lock, flags); - return count; } @@ -149,7 +141,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk, type); + slots->pinned += task_bp_pinned(bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk, type); + nr += task_bp_pinned(bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; @@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(tsk, type); + old_count = task_bp_pinned(bp, type); old_idx = old_count - 1; idx = old_idx + weight; + /* tsk_pinned[n] is the number of tasks having n breakpoints */ tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { tsk_pinned[idx]++; @@ -222,23 +215,41 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int cpu = bp->cpu; struct task_struct *tsk = bp->ctx->task; + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + /* Pinned counter task profiling */ - if (tsk) { - if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; - } + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; + toggle_bp_task_slot(bp, cpu, enable, type, weight); } - /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + list_add_tail(&bp->hw.bp_list, &bp_task_head); +} + +/* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ } /* @@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp) weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. + */ fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ @@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); + arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e1497481fe8..c3003e9d91a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_TIMER)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; desc->status |= IRQ_SUSPENDED; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c78017..2dc3786349d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -14,6 +14,8 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/freezer.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -35,6 +37,7 @@ struct kthread_create_info struct kthread { int should_stop; + void *data; struct completion exited; }; @@ -54,6 +57,19 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_data - return data value specified on kthread creation + * @task: kthread task in question + * + * Return the data value specified when kthread @task was created. + * The caller is responsible for ensuring the validity of @task when + * calling this function. + */ +void *kthread_data(struct task_struct *task) +{ + return to_kthread(task)->data; +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -64,6 +80,7 @@ static int kthread(void *_create) int ret; self.should_stop = 0; + self.data = data; init_completion(&self.exited); current->vfork_done = &self.exited; @@ -247,3 +264,150 @@ int kthreadd(void *unused) return 0; } + +/** + * kthread_worker_fn - kthread function to process kthread_worker + * @worker_ptr: pointer to initialized kthread_worker + * + * This function can be used as @threadfn to kthread_create() or + * kthread_run() with @worker_ptr argument pointing to an initialized + * kthread_worker. The started kthread will process work_list until + * the it is stopped with kthread_stop(). A kthread can also call + * this function directly after extra initialization. + * + * Different kthreads can be used for the same kthread_worker as long + * as there's only one kthread attached to it at any given time. A + * kthread_worker without an attached kthread simply collects queued + * kthread_works. + */ +int kthread_worker_fn(void *worker_ptr) +{ + struct kthread_worker *worker = worker_ptr; + struct kthread_work *work; + + WARN_ON(worker->task); + worker->task = current; +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock_irq(&worker->lock); + worker->task = NULL; + spin_unlock_irq(&worker->lock); + return 0; + } + + work = NULL; + spin_lock_irq(&worker->lock); + if (!list_empty(&worker->work_list)) { + work = list_first_entry(&worker->work_list, + struct kthread_work, node); + list_del_init(&work->node); + } + spin_unlock_irq(&worker->lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->func(work); + smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ + work->done_seq = work->queue_seq; + smp_mb(); /* mb worker-b1 paired with flush-b0 */ + if (atomic_read(&work->flushing)) + wake_up_all(&work->done); + } else if (!freezing(current)) + schedule(); + + try_to_freeze(); + goto repeat; +} +EXPORT_SYMBOL_GPL(kthread_worker_fn); + +/** + * queue_kthread_work - queue a kthread_work + * @worker: target kthread_worker + * @work: kthread_work to queue + * + * Queue @work to work processor @task for async execution. @task + * must have been created with kthread_worker_create(). Returns %true + * if @work was successfully queued, %false if it was already pending. + */ +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work) +{ + bool ret = false; + unsigned long flags; + + spin_lock_irqsave(&worker->lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &worker->work_list); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_kthread_work); + +/** + * flush_kthread_work - flush a kthread_work + * @work: work to flush + * + * If @work is queued or executing, wait for it to finish execution. + */ +void flush_kthread_work(struct kthread_work *work) +{ + int seq = work->queue_seq; + + atomic_inc(&work->flushing); + + /* + * mb flush-b0 paired with worker-b1, to make sure either + * worker sees the above increment or we see done_seq update. + */ + smp_mb__after_atomic_inc(); + + /* A - B <= 0 tests whether B is in front of A regardless of overflow */ + wait_event(work->done, seq - work->done_seq <= 0); + atomic_dec(&work->flushing); + + /* + * rmb flush-b1 paired with worker-b0, to make sure our caller + * sees every change made by work->func(). + */ + smp_mb__after_atomic_dec(); +} +EXPORT_SYMBOL_GPL(flush_kthread_work); + +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + +/** + * flush_kthread_worker - flush all current works on a kthread_worker + * @worker: worker to flush + * + * Wait until all currently executing or pending works on @worker are + * finished. + */ +void flush_kthread_worker(struct kthread_worker *worker) +{ + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + + queue_kthread_work(worker, &fwork.work); + wait_for_completion(&fwork.done); +} +EXPORT_SYMBOL_GPL(flush_kthread_worker); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37..f2852a51023 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) diff --git a/kernel/module.c b/kernel/module.c index 5d2d28197c8..d0b5f8db11b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,6 +1,6 @@ /* Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. + Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -110,6 +110,20 @@ int unregister_module_notifier(struct notifier_block * nb) } EXPORT_SYMBOL(unregister_module_notifier); +struct load_info { + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long *strmap; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + /* We require a truly strong try_module_get(): 0 means failure due to ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) @@ -140,42 +154,38 @@ void __module_put_and_exit(struct module *mod, long code) EXPORT_SYMBOL(__module_put_and_exit); /* Find a module section: 0 means not found. */ -static unsigned int find_sec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, - const char *name) +static unsigned int find_sec(const struct load_info *info, const char *name) { unsigned int i; - for (i = 1; i < hdr->e_shnum; i++) + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; /* Alloc bit cleared means "ignore it." */ - if ((sechdrs[i].sh_flags & SHF_ALLOC) - && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) + if ((shdr->sh_flags & SHF_ALLOC) + && strcmp(info->secstrings + shdr->sh_name, name) == 0) return i; + } return 0; } /* Find a module section, or NULL. */ -static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, - const char *secstrings, const char *name) +static void *section_addr(const struct load_info *info, const char *name) { /* Section 0 has sh_addr 0. */ - return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; + return (void *)info->sechdrs[find_sec(info, name)].sh_addr; } /* Find a module section, or NULL. Fill in number of "objects" in section. */ -static void *section_objs(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, +static void *section_objs(const struct load_info *info, const char *name, size_t object_size, unsigned int *num) { - unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + unsigned int sec = find_sec(info, name); /* Section 0 has sh_addr 0 and sh_size 0. */ - *num = sechdrs[sec].sh_size / object_size; - return (void *)sechdrs[sec].sh_addr; + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; } /* Provided by the linker */ @@ -227,7 +237,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, unsigned int symnum, void *data), void *data) { struct module *mod; - const struct symsearch arr[] = { + static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, @@ -392,7 +402,8 @@ static int percpu_modalloc(struct module *mod, mod->percpu = __alloc_reserved_percpu(size, align); if (!mod->percpu) { printk(KERN_WARNING - "Could not allocate %lu bytes percpu data\n", size); + "%s: Could not allocate %lu bytes percpu data\n", + mod->name, size); return -ENOMEM; } mod->percpu_size = size; @@ -404,11 +415,9 @@ static void percpu_modfree(struct module *mod) free_percpu(mod->percpu); } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { - return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); + return find_sec(info, ".data..percpu"); } static void percpu_modcopy(struct module *mod, @@ -468,9 +477,7 @@ static inline int percpu_modalloc(struct module *mod, static inline void percpu_modfree(struct module *mod) { } -static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { return 0; } @@ -524,21 +531,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); /* Init the unload section of the module. */ -static void module_unload_init(struct module *mod) +static int module_unload_init(struct module *mod) { - int cpu; + mod->refptr = alloc_percpu(struct module_ref); + if (!mod->refptr) + return -ENOMEM; INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); - for_each_possible_cpu(cpu) { - per_cpu_ptr(mod->refptr, cpu)->incs = 0; - per_cpu_ptr(mod->refptr, cpu)->decs = 0; - } /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; + + return 0; } /* Does a already use b? */ @@ -618,6 +625,8 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); + + free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -787,7 +796,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - ddebug_remove_module(mod->name); free_module(mod); return 0; @@ -892,8 +900,9 @@ int ref_module(struct module *a, struct module *b) } EXPORT_SYMBOL_GPL(ref_module); -static inline void module_unload_init(struct module *mod) +static inline int module_unload_init(struct module *mod) { + return 0; } #endif /* CONFIG_MODULE_UNLOAD */ @@ -1052,10 +1061,9 @@ static inline int same_magic(const char *amagic, const char *bmagic, #endif /* CONFIG_MODVERSIONS */ /* Resolve a symbol for this module. I.e. if we find one, record usage. */ -static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, - unsigned int versindex, +static const struct kernel_symbol *resolve_symbol(struct module *mod, + const struct load_info *info, const char *name, - struct module *mod, char ownername[]) { struct module *owner; @@ -1069,7 +1077,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, if (!sym) goto unlock; - if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, + owner)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -1088,21 +1097,20 @@ unlock: return sym; } -static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *name, - struct module *mod) +static const struct kernel_symbol * +resolve_symbol_wait(struct module *mod, + const struct load_info *info, + const char *name) { const struct kernel_symbol *ksym; - char ownername[MODULE_NAME_LEN]; + char owner[MODULE_NAME_LEN]; if (wait_event_interruptible_timeout(module_wq, - !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, - mod, ownername)) || - PTR_ERR(ksym) != -EBUSY, + !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) + || PTR_ERR(ksym) != -EBUSY, 30 * HZ) <= 0) { printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", - mod->name, ownername); + mod->name, owner); } return ksym; } @@ -1111,8 +1119,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> */ -#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) +#ifdef CONFIG_SYSFS +#ifdef CONFIG_KALLSYMS static inline bool sect_empty(const Elf_Shdr *sect) { return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; @@ -1149,8 +1158,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; @@ -1158,8 +1166,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, struct attribute **gattr; /* Count loaded sections and allocate structures */ - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1176,11 +1184,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; - for (i = 0; i < nsect; i++) { - if (sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + if (sect_empty(sec)) continue; - sattr->address = sechdrs[i].sh_addr; - sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + sattr->address = sec->sh_addr; + sattr->name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); if (sattr->name == NULL) goto out; @@ -1248,8 +1257,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; @@ -1261,9 +1269,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures. */ notes = 0; - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i]) && - (sechdrs[i].sh_type == SHT_NOTE)) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + (info->sechdrs[i].sh_type == SHT_NOTE)) ++notes; if (notes == 0) @@ -1277,15 +1285,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < nsect; ++i) { - if (sect_empty(&sechdrs[i])) + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) continue; - if (sechdrs[i].sh_type == SHT_NOTE) { + if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); nattr->attr.name = mod->sect_attrs->attrs[loaded].name; nattr->attr.mode = S_IRUGO; - nattr->size = sechdrs[i].sh_size; - nattr->private = (void *) sechdrs[i].sh_addr; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *) info->sechdrs[i].sh_addr; nattr->read = module_notes_read; ++nattr; } @@ -1316,8 +1324,8 @@ static void remove_notes_attrs(struct module *mod) #else -static inline void add_sect_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_sect_attrs(struct module *mod, + const struct load_info *info) { } @@ -1325,17 +1333,16 @@ static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_notes_attrs(struct module *mod, + const struct load_info *info) { } static inline void remove_notes_attrs(struct module *mod) { } -#endif +#endif /* CONFIG_KALLSYMS */ -#ifdef CONFIG_SYSFS static void add_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD @@ -1440,6 +1447,7 @@ out: } static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { @@ -1464,6 +1472,8 @@ static int mod_sysfs_setup(struct module *mod, goto out_unreg_param; add_usage_links(mod); + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1480,33 +1490,26 @@ out: static void mod_sysfs_fini(struct module *mod) { + remove_notes_attrs(mod); + remove_sect_attrs(mod); kobject_put(&mod->mkobj.kobj); } -#else /* CONFIG_SYSFS */ - -static inline int mod_sysfs_init(struct module *mod) -{ - return 0; -} +#else /* !CONFIG_SYSFS */ -static inline int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { return 0; } -static inline int module_add_modinfo_attrs(struct module *mod) -{ - return 0; -} - -static inline void module_remove_modinfo_attrs(struct module *mod) +static void mod_sysfs_fini(struct module *mod) { } -static void mod_sysfs_fini(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { } @@ -1516,7 +1519,7 @@ static void del_usage_links(struct module *mod) #endif /* CONFIG_SYSFS */ -static void mod_kobject_remove(struct module *mod) +static void mod_sysfs_teardown(struct module *mod) { del_usage_links(mod); module_remove_modinfo_attrs(mod); @@ -1546,9 +1549,10 @@ static void free_module(struct module *mod) mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); mutex_unlock(&module_mutex); - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_remove(mod); + mod_sysfs_teardown(mod); + + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1563,10 +1567,7 @@ static void free_module(struct module *mod) module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); -#if defined(CONFIG_MODULE_UNLOAD) - if (mod->refptr) - free_percpu(mod->refptr); -#endif + /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -1632,25 +1633,23 @@ static int verify_export_symbols(struct module *mod) } /* Change all symbols so that st_value encodes the pointer directly. */ -static int simplify_symbols(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - unsigned int versindex, - unsigned int pcpuindex, - struct module *mod) -{ - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; +static int simplify_symbols(struct module *mod, const struct load_info *info) +{ + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + Elf_Sym *sym = (void *)symsec->sh_addr; unsigned long secbase; - unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + unsigned int i; int ret = 0; const struct kernel_symbol *ksym; - for (i = 1; i < n; i++) { + for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { + const char *name = info->strtab + sym[i].st_name; + switch (sym[i].st_shndx) { case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); + DEBUGP("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1663,9 +1662,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol_wait(sechdrs, versindex, - strtab + sym[i].st_name, - mod); + ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; @@ -1677,17 +1674,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", - mod->name, strtab + sym[i].st_name, - PTR_ERR(ksym)); + mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; break; default: /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == pcpuindex) + if (sym[i].st_shndx == info->index.pcpu) secbase = (unsigned long)mod_percpu(mod); else - secbase = sechdrs[sym[i].st_shndx].sh_addr; + secbase = info->sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; break; } @@ -1696,6 +1692,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +static int apply_relocations(struct module *mod, const struct load_info *info) +{ + unsigned int i; + int err = 0; + + /* Now do relocations. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + unsigned int infosec = info->sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (infosec >= info->hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + continue; + + if (info->sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(info->sechdrs, info->strtab, + info->index.sym, i, mod); + else if (info->sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(info->sechdrs, info->strtab, + info->index.sym, i, mod); + if (err < 0) + break; + } + return err; +} + /* Additional bytes needed by arch in front of individual sections */ unsigned int __weak arch_mod_section_prepend(struct module *mod, unsigned int section) @@ -1720,10 +1745,7 @@ static long get_offset(struct module *mod, unsigned int *size, might -- code, read-only data, read-write data, small data. Tally sizes, and place the offsets into sh_entsize fields: high bit means it belongs in init. */ -static void layout_sections(struct module *mod, - const Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static void layout_sections(struct module *mod, struct load_info *info) { static unsigned long const masks[][2] = { /* NOTE: all executable code must be the first section @@ -1736,21 +1758,22 @@ static void layout_sections(struct module *mod, }; unsigned int m, i; - for (i = 0; i < hdr->e_shnum; i++) - sechdrs[i].sh_entsize = ~0UL; + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; DEBUGP("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(secstrings + s->sh_name, ".init")) + || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", name); } if (m == 0) mod->core_text_size = mod->core_size; @@ -1758,17 +1781,18 @@ static void layout_sections(struct module *mod, DEBUGP("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(secstrings + s->sh_name, ".init")) + || !strstarts(sname, ".init")) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", sname); } if (m == 0) mod->init_text_size = mod->init_size; @@ -1807,33 +1831,28 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(Elf_Shdr *sechdrs, - unsigned int info, - const char *tag) +static char *get_modinfo(struct load_info *info, const char *tag) { char *p; unsigned int taglen = strlen(tag); - unsigned long size = sechdrs[info].sh_size; + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; - for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { + for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } return NULL; } -static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, - unsigned int infoindex) +static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) - attr->setup(mod, - get_modinfo(sechdrs, - infoindex, - attr->attr.name)); + attr->setup(mod, get_modinfo(info, attr->attr.name)); } } @@ -1874,11 +1893,10 @@ static int is_exported(const char *name, unsigned long value, } /* As per nm */ -static char elf_type(const Elf_Sym *sym, - Elf_Shdr *sechdrs, - const char *secstrings, - struct module *mod) +static char elf_type(const Elf_Sym *sym, const struct load_info *info) { + const Elf_Shdr *sechdrs = info->sechdrs; + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) return 'v'; @@ -1908,8 +1926,10 @@ static char elf_type(const Elf_Sym *sym, else return 'b'; } - if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { return 'n'; + } return '?'; } @@ -1934,127 +1954,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } -static unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static void layout_symtab(struct module *mod, struct load_info *info) { - unsigned long symoffs; - Elf_Shdr *symsect = sechdrs + symindex; - Elf_Shdr *strsect = sechdrs + strindex; + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - const char *strtab; unsigned int i, nsrc, ndst; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, - symindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + symsect->sh_name); + info->index.sym) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); - src = (void *)hdr + symsect->sh_offset; + src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - strtab = (void *)hdr + strsect->sh_offset; for (ndst = i = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; - while(!__test_and_set_bit(j, strmap) && strtab[j]) + while (!__test_and_set_bit(j, info->strmap) + && info->strtab[j]) ++j; ++ndst; } /* Append room for core symbols at end of core part. */ - symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, - strindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + strsect->sh_name); + info->index.str) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); /* Append room for core symbols' strings at end of core part. */ - *pstroffs = mod->core_size; - __set_bit(0, strmap); - mod->core_size += bitmap_weight(strmap, strsect->sh_size); - - return symoffs; + info->stroffs = mod->core_size; + __set_bit(0, info->strmap); + mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -static void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - unsigned long *strmap) +static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; const Elf_Sym *src; Elf_Sym *dst; char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)sechdrs[symindex].sh_addr; - mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - mod->strtab = (void *)sechdrs[strindex].sh_addr; + mod->symtab = (void *)symsec->sh_addr; + mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info - = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + symoffs; + mod->core_symtab = dst = mod->module_core + info->symoffs; src = mod->symtab; *dst = *src; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, sechdrs, shnum)) + if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + dst[ndst].st_name = bitmap_weight(info->strmap, + dst[ndst].st_name); ++ndst; } mod->core_num_syms = ndst; - mod->core_strtab = s = mod->module_core + stroffs; - for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) - if (test_bit(i, strmap)) + mod->core_strtab = s = mod->module_core + info->stroffs; + for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) + if (test_bit(i, info->strmap)) *++s = mod->strtab[i]; } #else -static inline unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static inline void layout_symtab(struct module *mod, struct load_info *info) { - return 0; } -static inline void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - const unsigned long *strmap) +static void add_kallsyms(struct module *mod, struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { + if (!debug) + return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) printk(KERN_ERR "dynamic debug error adding module: %s\n", @@ -2085,65 +2074,47 @@ static void *module_alloc_update_bounds(unsigned long size) } #ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { unsigned int i; /* only scan the sections containing data */ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - for (i = 1; i < hdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + for (i = 1; i < info->hdr->e_shnum; i++) { + const char *name = info->secstrings + info->sechdrs[i].sh_name; + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) continue; - if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 - && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) + if (!strstarts(name, ".data") && !strstarts(name, ".bss")) continue; - kmemleak_scan_area((void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size, GFP_KERNEL); + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); } } #else -static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } #endif -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. */ -static noinline struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +/* Sets info->hdr and info->len. */ +static int copy_and_check(struct load_info *info, + const void __user *umod, unsigned long len, + const char __user *uargs) { + int err; Elf_Ehdr *hdr; - Elf_Shdr *sechdrs; - char *secstrings, *args, *modmagic, *strtab = NULL; - char *staging; - unsigned int i; - unsigned int symindex = 0; - unsigned int strindex = 0; - unsigned int modindex, versindex, infoindex, pcpuindex; - struct module *mod; - long err = 0; - void *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long symoffs, stroffs, *strmap; - void __percpu *percpu; - struct _ddebug *debug = NULL; - unsigned int num_debug = 0; - - mm_segment_t old_fs; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); if (len < sizeof(*hdr)) - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ /* vmalloc barfs on "unusual" numbers. Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) - return ERR_PTR(-ENOMEM); + return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; @@ -2151,135 +2122,225 @@ static noinline struct module *load_module(void __user *umod, } /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ + weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(*sechdrs)) { + || hdr->e_shentsize != sizeof(Elf_Shdr)) { + err = -ENOEXEC; + goto free_hdr; + } + + if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { err = -ENOEXEC; goto free_hdr; } - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) - goto truncated; + info->hdr = hdr; + info->len = len; + return 0; + +free_hdr: + vfree(hdr); + return err; +} + +static void free_copy(struct load_info *info) +{ + vfree(info->hdr); +} + +static int rewrite_section_headers(struct load_info *info) +{ + unsigned int i; - /* Convenience variables */ - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - sechdrs[0].sh_addr = 0; + /* This should always be true, but let's be sure. */ + info->sechdrs[0].sh_addr = 0; - for (i = 1; i < hdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_NOBITS - && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) - goto truncated; + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (shdr->sh_type != SHT_NOBITS + && info->len < shdr->sh_offset + shdr->sh_size) { + printk(KERN_ERR "Module len %lu truncated\n", + info->len); + return -ENOEXEC; + } /* Mark all sections sh_addr with their address in the temporary image. */ - sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; + shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; - /* Internal symbols and strings. */ - if (sechdrs[i].sh_type == SHT_SYMTAB) { - symindex = i; - strindex = sechdrs[i].sh_link; - strtab = (char *)hdr + sechdrs[strindex].sh_offset; - } #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; + if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } - modindex = find_sec(hdr, sechdrs, secstrings, - ".gnu.linkonce.this_module"); - if (!modindex) { + /* Track but don't keep modinfo and version sections. */ + info->index.vers = find_sec(info, "__versions"); + info->index.info = find_sec(info, ".modinfo"); + info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; +} + +/* + * Set up our basic convenience variables (pointers to section headers, + * search for module section index etc), and do some basic section + * verification. + * + * Return the temporary module pointer (we'll replace it with the final + * one when we move the module sections around). + */ +static struct module *setup_load_info(struct load_info *info) +{ + unsigned int i; + int err; + struct module *mod; + + /* Set up the convenience variables */ + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + info->secstrings = (void *)info->hdr + + info->sechdrs[info->hdr->e_shstrndx].sh_offset; + + err = rewrite_section_headers(info); + if (err) + return ERR_PTR(err); + + /* Find internal symbols and strings. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + if (info->sechdrs[i].sh_type == SHT_SYMTAB) { + info->index.sym = i; + info->index.str = info->sechdrs[i].sh_link; + info->strtab = (char *)info->hdr + + info->sechdrs[info->index.str].sh_offset; + break; + } + } + + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); + if (!info->index.mod) { printk(KERN_WARNING "No module found in object\n"); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ - mod = (void *)sechdrs[modindex].sh_addr; + mod = (void *)info->sechdrs[info->index.mod].sh_addr; - if (symindex == 0) { + if (info->index.sym == 0) { printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", mod->name); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } - versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); - infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); - pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); - - /* Don't keep modinfo and version sections. */ - sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(sechdrs, versindex, mod)) { - err = -ENOEXEC; - goto free_hdr; - } + if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + return ERR_PTR(-ENOEXEC); + + return mod; +} + +static int check_modinfo(struct module *mod, struct load_info *info) +{ + const char *modmagic = get_modinfo(info, "vermagic"); + int err; - modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); if (err) - goto free_hdr; - } else if (!same_magic(modmagic, vermagic, versindex)) { + return err; + } else if (!same_magic(modmagic, vermagic, info->index.vers)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); - err = -ENOEXEC; - goto free_hdr; + return -ENOEXEC; } - staging = get_modinfo(sechdrs, infoindex, "staging"); - if (staging) { + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); } - /* Now copy in args */ - args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(args)) { - err = PTR_ERR(args); - goto free_hdr; - } + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(info, "license")); - strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) - * sizeof(long), GFP_KERNEL); - if (!strmap) { - err = -ENOMEM; - goto free_mod; - } + return 0; +} - mod->state = MODULE_STATE_COMING; +static void find_module_sections(struct module *mod, struct load_info *info) +{ + mod->kp = section_objs(info, "__param", + sizeof(*mod->kp), &mod->num_kp); + mod->syms = section_objs(info, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(info, "__kcrctab"); + mod->gpl_syms = section_objs(info, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(info, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); - /* Allow arches to frob section contents and sizes. */ - err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); - if (err < 0) - goto free_mod; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->unused_syms = section_objs(info, "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(info, "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); +#endif +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(info, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); +#endif - if (pcpuindex) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); - if (err) - goto free_mod; - sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - } - /* Keep this around for failure path. */ - percpu = mod_percpu(mod); +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(info, "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); +#endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(info, "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(info, "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ - layout_sections(mod, hdr, sechdrs, secstrings); - symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, - secstrings, &stroffs, strmap); + mod->extable = section_objs(info, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + + if (section_addr(info, "__obsparm")) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + info->debug = section_objs(info, "__verbose", + sizeof(*info->debug), &info->num_debug); +} + +static int move_module(struct module *mod, struct load_info *info) +{ + int i; + void *ptr; /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2289,10 +2350,9 @@ static noinline struct module *load_module(void __user *umod, * leak. */ kmemleak_not_leak(ptr); - if (!ptr) { - err = -ENOMEM; - goto free_percpu; - } + if (!ptr) + return -ENOMEM; + memset(ptr, 0, mod->core_size); mod->module_core = ptr; @@ -2305,50 +2365,40 @@ static noinline struct module *load_module(void __user *umod, */ kmemleak_ignore(ptr); if (!ptr && mod->init_size) { - err = -ENOMEM; - goto free_core; + module_free(mod, mod->module_core); + return -ENOMEM; } memset(ptr, 0, mod->init_size); mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ DEBUGP("final section addresses:\n"); - for (i = 0; i < hdr->e_shnum; i++) { + for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; + Elf_Shdr *shdr = &info->sechdrs[i]; - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (!(shdr->sh_flags & SHF_ALLOC)) continue; - if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) + if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->module_init - + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); + + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + sechdrs[i].sh_entsize; + dest = mod->module_core + shdr->sh_entsize; - if (sechdrs[i].sh_type != SHT_NOBITS) - memcpy(dest, (void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size); + if (shdr->sh_type != SHT_NOBITS) + memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ - sechdrs[i].sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); + shdr->sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", + shdr->sh_addr, info->secstrings + shdr->sh_name); } - /* Module has been moved. */ - mod = (void *)sechdrs[modindex].sh_addr; - kmemleak_load_module(mod, hdr, sechdrs, secstrings); -#if defined(CONFIG_MODULE_UNLOAD) - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) { - err = -ENOMEM; - goto free_init; - } -#endif - /* Now we've moved module, initialize linked lists, etc. */ - module_unload_init(mod); - - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + return 0; +} +static int check_module_license_and_versions(struct module *mod) +{ /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -2361,77 +2411,6 @@ static noinline struct module *load_module(void __user *umod, if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, sechdrs, infoindex); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, - mod); - if (err < 0) - goto cleanup; - - /* Now we've got everything in the final locations, we can - * find optional sections. */ - mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", - sizeof(*mod->kp), &mod->num_kp); - mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", - sizeof(*mod->syms), &mod->num_syms); - mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); - mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_gpl_future", - sizeof(*mod->gpl_future_syms), - &mod->num_gpl_future_syms); - mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_gpl_future"); - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->unused_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused", - sizeof(*mod->unused_syms), - &mod->num_unused_syms); - mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused"); - mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused_gpl", - sizeof(*mod->unused_gpl_syms), - &mod->num_unused_gpl_syms); - mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused_gpl"); -#endif -#ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", - sizeof(*mod->ctors), &mod->num_ctors); -#endif - -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = section_objs(hdr, sechdrs, secstrings, - "__tracepoints", - sizeof(*mod->tracepoints), - &mod->num_tracepoints); -#endif -#ifdef CONFIG_EVENT_TRACING - mod->trace_events = section_objs(hdr, sechdrs, secstrings, - "_ftrace_events", - sizeof(*mod->trace_events), - &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); -#endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD - /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, - "__mcount_loc", - sizeof(*mod->ftrace_callsites), - &mod->num_ftrace_callsites); -#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2441,56 +2420,16 @@ static noinline struct module *load_module(void __user *umod, || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { - err = try_to_force_load(mod, - "no versions for exported symbols"); - if (err) - goto cleanup; + return try_to_force_load(mod, + "no versions for exported symbols"); } #endif + return 0; +} - /* Now do relocations. */ - for (i = 1; i < hdr->e_shnum; i++) { - const char *strtab = (char *)sechdrs[strindex].sh_addr; - unsigned int info = sechdrs[i].sh_info; - - /* Not a valid relocation section? */ - if (info >= hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(sechdrs[info].sh_flags & SHF_ALLOC)) - continue; - - if (sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(sechdrs, strtab, symindex, i,mod); - else if (sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(sechdrs, strtab, symindex, i, - mod); - if (err < 0) - goto cleanup; - } - - /* Set up and sort exception table */ - mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", - sizeof(*mod->extable), &mod->num_exentries); - sort_extable(mod->extable, mod->extable + mod->num_exentries); - - /* Finally, copy percpu area over. */ - percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, - sechdrs[pcpuindex].sh_size); - - add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, - symoffs, stroffs, secstrings, strmap); - kfree(strmap); - strmap = NULL; - - if (!mod->taints) - debug = section_objs(hdr, sechdrs, secstrings, "__verbose", - sizeof(*debug), &num_debug); - - err = module_finalize(hdr, sechdrs, mod); - if (err < 0) - goto cleanup; +static void flush_module_icache(const struct module *mod) +{ + mm_segment_t old_fs; /* flush the icache in correct context */ old_fs = get_fs(); @@ -2509,11 +2448,160 @@ static noinline struct module *load_module(void __user *umod, (unsigned long)mod->module_core + mod->core_size); set_fs(old_fs); +} - mod->args = args; - if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); +static struct module *layout_and_allocate(struct load_info *info) +{ + /* Module within temporary copy. */ + struct module *mod; + Elf_Shdr *pcpusec; + int err; + + mod = setup_load_info(info); + if (IS_ERR(mod)) + return mod; + + err = check_modinfo(mod, info); + if (err) + return ERR_PTR(err); + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(info->hdr, info->sechdrs, + info->secstrings, mod); + if (err < 0) + goto out; + + pcpusec = &info->sechdrs[info->index.pcpu]; + if (pcpusec->sh_size) { + /* We have a special allocation for this section. */ + err = percpu_modalloc(mod, + pcpusec->sh_size, pcpusec->sh_addralign); + if (err) + goto out; + pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; + } + + /* Determine total sizes, and put offsets in sh_entsize. For now + this is done generically; there doesn't appear to be any + special cases for the architectures. */ + layout_sections(mod, info); + + info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) + * sizeof(long), GFP_KERNEL); + if (!info->strmap) { + err = -ENOMEM; + goto free_percpu; + } + layout_symtab(mod, info); + + /* Allocate and move to the final place */ + err = move_module(mod, info); + if (err) + goto free_strmap; + + /* Module has been copied to its final place now: return it. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + kmemleak_load_module(mod, info); + return mod; + +free_strmap: + kfree(info->strmap); +free_percpu: + percpu_modfree(mod); +out: + return ERR_PTR(err); +} + +/* mod is no longer valid after this! */ +static void module_deallocate(struct module *mod, struct load_info *info) +{ + kfree(info->strmap); + percpu_modfree(mod); + module_free(mod, mod->module_init); + module_free(mod, mod->module_core); +} + +static int post_relocation(struct module *mod, const struct load_info *info) +{ + /* Sort exception table now relocations are done. */ + sort_extable(mod->extable, mod->extable + mod->num_exentries); + + /* Copy relocated percpu area over. */ + percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, + info->sechdrs[info->index.pcpu].sh_size); + + /* Setup kallsyms-specific fields. */ + add_kallsyms(mod, info); + + /* Arch-specific module finalizing. */ + return module_finalize(info->hdr, info->sechdrs, mod); +} + +/* Allocate and load the module: note that size of section 0 is always + zero, and we rely on this for optional sections. */ +static struct module *load_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + struct load_info info = { NULL, }; + struct module *mod; + long err; + + DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + + /* Copy in the blobs from userspace, check they are vaguely sane. */ + err = copy_and_check(&info, umod, len, uargs); + if (err) + return ERR_PTR(err); + + /* Figure out module layout, and allocate all the memory. */ + mod = layout_and_allocate(&info); + if (IS_ERR(mod)) { + err = PTR_ERR(mod); + goto free_copy; + } + + /* Now module is in final location, initialize linked lists, etc. */ + err = module_unload_init(mod); + if (err) + goto free_module; + + /* Now we've got everything in the final locations, we can + * find optional sections. */ + find_module_sections(mod, &info); + + err = check_module_license_and_versions(mod); + if (err) + goto free_unload; + + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, &info); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(mod, &info); + if (err < 0) + goto free_modinfo; + + err = apply_relocations(mod, &info); + if (err < 0) + goto free_modinfo; + + err = post_relocation(mod, &info); + if (err < 0) + goto free_modinfo; + + flush_module_icache(mod); + + /* Now copy in args */ + mod->args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(mod->args)) { + err = PTR_ERR(mod->args); + goto free_arch_cleanup; + } + + /* Mark state as coming so strong_try_module_get() ignores us. */ + mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since @@ -2528,8 +2616,9 @@ static noinline struct module *load_module(void __user *umod, goto unlock; } - if (debug) - dynamic_debug_setup(debug, num_debug); + /* This has to be done once we're sure module name is unique. */ + if (!mod->taints) + dynamic_debug_setup(info.debug, info.num_debug); /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2539,23 +2628,22 @@ static noinline struct module *load_module(void __user *umod, list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); + /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); + /* Link in to syfs. */ + err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); if (err < 0) goto unlink; - add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - - /* Get rid of temporary copy */ - vfree(hdr); - - trace_module_load(mod); + /* Get rid of temporary copy and strmap. */ + kfree(info.strmap); + free_copy(&info); /* Done! */ + trace_module_load(mod); return mod; unlink: @@ -2563,35 +2651,23 @@ static noinline struct module *load_module(void __user *umod, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); ddebug: - dynamic_debug_remove(debug); + if (!mod->taints) + dynamic_debug_remove(info.debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); + kfree(mod->args); + free_arch_cleanup: module_arch_cleanup(mod); - cleanup: + free_modinfo: free_modinfo(mod); + free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) - free_percpu(mod->refptr); - free_init: -#endif - module_free(mod, mod->module_init); - free_core: - module_free(mod, mod->module_core); - /* mod will be freed with core. Don't access it beyond this line! */ - free_percpu: - free_percpu(percpu); - free_mod: - kfree(args); - kfree(strmap); - free_hdr: - vfree(hdr); + free_module: + module_deallocate(mod, &info); + free_copy: + free_copy(&info); return ERR_PTR(err); - - truncated: - printk(KERN_ERR "Module len %lu truncated\n", len); - err = -ENOEXEC; - goto free_hdr; } /* Call module constructors. */ diff --git a/kernel/padata.c b/kernel/padata.c index fdd8ae609ce..751019415d2 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -26,18 +26,19 @@ #include <linux/mutex.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/rcupdate.h> -#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; - target_cpu = cpumask_first(pd->cpumask); + target_cpu = cpumask_first(pd->cpumask.pcpu); for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask); + target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); return target_cpu; } @@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata) * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *work) +static void padata_parallel_worker(struct work_struct *parallel_work) { - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, pwork); - pd = queue->pd; + pqueue = container_of(parallel_work, + struct padata_parallel_queue, work); + pd = pqueue->pd; pinst = pd->pinst; - spin_lock(&queue->parallel.lock); - list_replace_init(&queue->parallel.list, &local_list); - spin_unlock(&queue->parallel.lock); + spin_lock(&pqueue->parallel.lock); + list_replace_init(&pqueue->parallel.list, &local_list); + spin_unlock(&pqueue->parallel.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work) * @pinst: padata instance * @padata: object to be parallelized * @cb_cpu: cpu the serialization callback function will run on, - * must be in the cpumask of padata. + * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). * * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel @@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu) { int target_cpu, err; - struct padata_queue *queue; + struct padata_parallel_queue *queue; struct parallel_data *pd; rcu_read_lock_bh(); pd = rcu_dereference(pinst->pd); - err = 0; - if (!(pinst->flags & PADATA_INIT)) + err = -EINVAL; + if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) + goto out; + + if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) goto out; err = -EBUSY; @@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst, if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) goto out; - err = -EINVAL; - if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) - goto out; - - err = -EINPROGRESS; + err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = cb_cpu; @@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst, padata->seq_nr = atomic_inc_return(&pd->seq_nr); target_cpu = padata_cpu_hash(padata); - queue = per_cpu_ptr(pd->queue, target_cpu); + queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->wq, &queue->pwork); + queue_work_on(target_cpu, pinst->wq, &queue->work); out: rcu_read_unlock_bh(); @@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus, empty, calc_seq_nr; - int seq_nr, next_nr, overrun, next_overrun; - struct padata_queue *queue, *next_queue; + int cpu, num_cpus; + int next_nr, next_index; + struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; - empty = 0; - next_nr = -1; - next_overrun = 0; - next_queue = NULL; - - num_cpus = cpumask_weight(pd->cpumask); - - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - reorder = &queue->reorder; - - /* - * Calculate the seq_nr of the object that should be - * next in this reorder queue. - */ - overrun = 0; - calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) - + queue->cpu_index; + num_cpus = cpumask_weight(pd->cpumask.pcpu); - if (unlikely(calc_seq_nr > pd->max_seq_nr)) { - calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; - overrun = 1; - } - - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - seq_nr = padata->seq_nr; - BUG_ON(calc_seq_nr != seq_nr); - } else { - seq_nr = calc_seq_nr; - empty++; - } - - if (next_nr < 0 || seq_nr < next_nr - || (next_overrun && !overrun)) { - next_nr = seq_nr; - next_overrun = overrun; - next_queue = queue; - } + /* + * Calculate the percpu reorder queue and the sequence + * number of the next object. + */ + next_nr = pd->processed; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + + if (unlikely(next_nr > pd->max_seq_nr)) { + next_nr = next_nr - pd->max_seq_nr - 1; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + pd->processed = 0; } padata = NULL; - if (empty == num_cpus) - goto out; - reorder = &next_queue->reorder; if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - if (unlikely(next_overrun)) { - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - atomic_set(&queue->num_obj, 0); - } - } + BUG_ON(next_nr != padata->seq_nr); spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); spin_unlock(&reorder->lock); - atomic_inc(&next_queue->num_obj); + pd->processed++; goto out; } - queue = per_cpu_ptr(pd->queue, smp_processor_id()); + queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); if (queue->cpu_index == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; @@ -262,7 +231,7 @@ out: static void padata_reorder(struct parallel_data *pd) { struct padata_priv *padata; - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; /* @@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd) return; } - queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); - spin_lock(&queue->serial.lock); - list_add_tail(&padata->list, &queue->serial.list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_add_tail(&padata->list, &squeue->serial.list); + spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); @@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg) padata_reorder(pd); } -static void padata_serial_worker(struct work_struct *work) +static void padata_serial_worker(struct work_struct *serial_work) { - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, swork); - pd = queue->pd; + squeue = container_of(serial_work, struct padata_serial_queue, work); + pd = squeue->pd; - spin_lock(&queue->serial.lock); - list_replace_init(&queue->serial.list, &local_list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_replace_init(&squeue->serial.list, &local_list); + spin_unlock(&squeue->serial.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work) void padata_do_serial(struct padata_priv *padata) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; pd = padata->pd; cpu = get_cpu(); - queue = per_cpu_ptr(pd->queue, cpu); + pqueue = per_cpu_ptr(pd->pqueue, cpu); - spin_lock(&queue->reorder.lock); + spin_lock(&pqueue->reorder.lock); atomic_inc(&pd->reorder_objects); - list_add_tail(&padata->list, &queue->reorder.list); - spin_unlock(&queue->reorder.lock); + list_add_tail(&padata->list, &pqueue->reorder.list); + spin_unlock(&pqueue->reorder.lock); put_cpu(); @@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata) } EXPORT_SYMBOL(padata_do_serial); -/* Allocate and initialize the internal cpumask dependend resources. */ -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *cpumask) +static int padata_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { - int cpu, cpu_index, num_cpus; - struct padata_queue *queue; - struct parallel_data *pd; - - cpu_index = 0; + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + return -ENOMEM; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); - if (!pd) - goto err; + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pd->cpumask.cbcpu); + return -ENOMEM; + } - pd->queue = alloc_percpu(struct padata_queue); - if (!pd->queue) - goto err_free_pd; + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + return 0; +} - if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) - goto err_free_queue; +static void __padata_list_init(struct padata_list *pd_list) +{ + INIT_LIST_HEAD(&pd_list->list); + spin_lock_init(&pd_list->lock); +} - cpumask_and(pd->cpumask, cpumask, cpu_active_mask); +/* Initialize all percpu queues used by serial workers */ +static void padata_init_squeues(struct parallel_data *pd) +{ + int cpu; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + squeue->pd = pd; + __padata_list_init(&squeue->serial); + INIT_WORK(&squeue->work, padata_serial_worker); + } +} - queue->pd = pd; +/* Initialize all percpu queues used by parallel workers */ +static void padata_init_pqueues(struct parallel_data *pd) +{ + int cpu_index, num_cpus, cpu; + struct padata_parallel_queue *pqueue; - queue->cpu_index = cpu_index; + cpu_index = 0; + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + pqueue->pd = pd; + pqueue->cpu_index = cpu_index; cpu_index++; - INIT_LIST_HEAD(&queue->reorder.list); - INIT_LIST_HEAD(&queue->parallel.list); - INIT_LIST_HEAD(&queue->serial.list); - spin_lock_init(&queue->reorder.lock); - spin_lock_init(&queue->parallel.lock); - spin_lock_init(&queue->serial.lock); - - INIT_WORK(&queue->pwork, padata_parallel_worker); - INIT_WORK(&queue->swork, padata_serial_worker); - atomic_set(&queue->num_obj, 0); + __padata_list_init(&pqueue->reorder); + __padata_list_init(&pqueue->parallel); + INIT_WORK(&pqueue->work, padata_parallel_worker); + atomic_set(&pqueue->num_obj, 0); } - num_cpus = cpumask_weight(pd->cpumask); - pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + num_cpus = cpumask_weight(pd->cpumask.pcpu); + pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; +} + +/* Allocate and initialize the internal cpumask dependend resources. */ +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct parallel_data *pd; + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->pqueue = alloc_percpu(struct padata_parallel_queue); + if (!pd->pqueue) + goto err_free_pd; + + pd->squeue = alloc_percpu(struct padata_serial_queue); + if (!pd->squeue) + goto err_free_pqueue; + if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + goto err_free_squeue; + + padata_init_pqueues(pd); + padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); @@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, return pd; -err_free_queue: - free_percpu(pd->queue); +err_free_squeue: + free_percpu(pd->squeue); +err_free_pqueue: + free_percpu(pd->pqueue); err_free_pd: kfree(pd); err: @@ -456,8 +464,10 @@ err: static void padata_free_pd(struct parallel_data *pd) { - free_cpumask_var(pd->cpumask); - free_percpu(pd->queue); + free_cpumask_var(pd->cpumask.pcpu); + free_cpumask_var(pd->cpumask.cbcpu); + free_percpu(pd->pqueue); + free_percpu(pd->squeue); kfree(pd); } @@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd) static void padata_flush_queues(struct parallel_data *pd) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->pwork); + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + flush_work(&pqueue->work); } del_timer_sync(&pd->timer); @@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd) if (atomic_read(&pd->reorder_objects)) padata_reorder(pd); - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->swork); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + flush_work(&squeue->work); } BUG_ON(atomic_read(&pd->refcnt) != 0); } +static void __padata_start(struct padata_instance *pinst) +{ + pinst->flags |= PADATA_INIT; +} + +static void __padata_stop(struct padata_instance *pinst) +{ + if (!(pinst->flags & PADATA_INIT)) + return; + + pinst->flags &= ~PADATA_INIT; + + synchronize_rcu(); + + get_online_cpus(); + padata_flush_queues(pinst->pd); + put_online_cpus(); +} + /* Replace the internal control stucture with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { struct parallel_data *pd_old = pinst->pd; + int notification_mask = 0; pinst->flags |= PADATA_RESET; @@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst, synchronize_rcu(); + if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + notification_mask |= PADATA_CPU_PARALLEL; + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + notification_mask |= PADATA_CPU_SERIAL; + padata_flush_queues(pd_old); padata_free_pd(pd_old); + if (notification_mask) + blocking_notifier_call_chain(&pinst->cpumask_change_notifier, + notification_mask, + &pd_new->cpumask); + pinst->flags &= ~PADATA_RESET; } /** - * padata_set_cpumask - set the cpumask that padata should use + * padata_register_cpumask_notifier - Registers a notifier that will be called + * if either pcpu or cbcpu or both cpumasks change. * - * @pinst: padata instance - * @cpumask: the cpumask to use + * @pinst: A poineter to padata instance + * @nblock: A pointer to notifier block. */ -int padata_set_cpumask(struct padata_instance *pinst, - cpumask_var_t cpumask) +int padata_register_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) { + return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_register_cpumask_notifier); + +/** + * padata_unregister_cpumask_notifier - Unregisters cpumask notifier + * registered earlier using padata_register_cpumask_notifier + * + * @pinst: A pointer to data instance. + * @nlock: A pointer to notifier block. + */ +int padata_unregister_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_unregister( + &pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_unregister_cpumask_notifier); + + +/* If cpumask contains no active cpu, we mark the instance as invalid. */ +static bool padata_validate_cpumask(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + if (!cpumask_intersects(cpumask, cpu_active_mask)) { + pinst->flags |= PADATA_INVALID; + return false; + } + + pinst->flags &= ~PADATA_INVALID; + return true; +} + +static int __padata_set_cpumasks(struct padata_instance *pinst, + cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int valid; struct parallel_data *pd; - int err = 0; + + valid = padata_validate_cpumask(pinst, pcpumask); + if (!valid) { + __padata_stop(pinst); + goto out_replace; + } + + valid = padata_validate_cpumask(pinst, cbcpumask); + if (!valid) + __padata_stop(pinst); + +out_replace: + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + return -ENOMEM; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + padata_replace(pinst, pd); + + if (valid) + __padata_start(pinst); + + return 0; +} + +/** + * padata_set_cpumasks - Set both parallel and serial cpumasks. The first + * one is used by parallel workers and the second one + * by the wokers doing serialization. + * + * @pinst: padata instance + * @pcpumask: the cpumask to use for parallel workers + * @cbcpumask: the cpumsak to use for serial workers + */ +int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int err; mutex_lock(&pinst->lock); + get_online_cpus(); + err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); + + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; + +} +EXPORT_SYMBOL(padata_set_cpumasks); + +/** + * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. + * + * @pinst: padata instance + * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding + * to parallel and serial cpumasks respectively. + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask) +{ + struct cpumask *serial_mask, *parallel_mask; + int err = -EINVAL; + + mutex_lock(&pinst->lock); get_online_cpus(); - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) { - err = -ENOMEM; - goto out; + switch (cpumask_type) { + case PADATA_CPU_PARALLEL: + serial_mask = pinst->cpumask.cbcpu; + parallel_mask = cpumask; + break; + case PADATA_CPU_SERIAL: + parallel_mask = pinst->cpumask.pcpu; + serial_mask = cpumask; + break; + default: + goto out; } - cpumask_copy(pinst->cpumask, cpumask); - - padata_replace(pinst, pd); + err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); out: put_online_cpus(); - mutex_unlock(&pinst->lock); return err; @@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) struct parallel_data *pd; if (cpumask_test_cpu(cpu, cpu_active_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; padata_replace(pinst, pd); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_start(pinst); } return 0; } -/** - * padata_add_cpu - add a cpu to the padata cpumask + /** + * padata_add_cpu - add a cpu to one or both(parallel and serial) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to add + * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_add_cpu(struct padata_instance *pinst, int cpu) + +int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_set_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_set_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_add_cpu(pinst, cpu); put_online_cpus(); @@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu); static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd; + struct parallel_data *pd = NULL; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_stop(pinst); + + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; @@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return 0; } -/** - * padata_remove_cpu - remove a cpu from the padata cpumask + /** + * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to remove + * @mask: bitmask specifying from which cpumask @cpu should be removed + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu) +int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_clear_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_remove_cpu(pinst, cpu); put_online_cpus(); @@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu); * * @pinst: padata instance to start */ -void padata_start(struct padata_instance *pinst) +int padata_start(struct padata_instance *pinst) { + int err = 0; + mutex_lock(&pinst->lock); - pinst->flags |= PADATA_INIT; + + if (pinst->flags & PADATA_INVALID) + err =-EINVAL; + + __padata_start(pinst); + mutex_unlock(&pinst->lock); + + return err; } EXPORT_SYMBOL(padata_start); @@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start); void padata_stop(struct padata_instance *pinst) { mutex_lock(&pinst->lock); - pinst->flags &= ~PADATA_INIT; + __padata_stop(pinst); mutex_unlock(&pinst->lock); } EXPORT_SYMBOL(padata_stop); #ifdef CONFIG_HOTPLUG_CPU + +static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) +{ + return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || + cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); +} + + static int padata_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_add_cpu(pinst, cpu); @@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_remove_cpu(pinst, cpu); @@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_remove_cpu(pinst, cpu); @@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_add_cpu(pinst, cpu); @@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb, } #endif +static void __padata_free(struct padata_instance *pinst) +{ +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + padata_stop(pinst); + padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); + kfree(pinst); +} + +#define kobj2pinst(_kobj) \ + container_of(_kobj, struct padata_instance, kobj) +#define attr2pentry(_attr) \ + container_of(_attr, struct padata_sysfs_entry, attr) + +static void padata_sysfs_release(struct kobject *kobj) +{ + struct padata_instance *pinst = kobj2pinst(kobj); + __padata_free(pinst); +} + +struct padata_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct padata_instance *, struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, struct attribute *, + const char *, size_t); +}; + +static ssize_t show_cpumask(struct padata_instance *pinst, + struct attribute *attr, char *buf) +{ + struct cpumask *cpumask; + ssize_t len; + + mutex_lock(&pinst->lock); + if (!strcmp(attr->name, "serial_cpumask")) + cpumask = pinst->cpumask.cbcpu; + else + cpumask = pinst->cpumask.pcpu; + + len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), + nr_cpu_ids); + if (PAGE_SIZE - len < 2) + len = -EINVAL; + else + len += sprintf(buf + len, "\n"); + + mutex_unlock(&pinst->lock); + return len; +} + +static ssize_t store_cpumask(struct padata_instance *pinst, + struct attribute *attr, + const char *buf, size_t count) +{ + cpumask_var_t new_cpumask; + ssize_t ret; + int mask_type; + + if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), + nr_cpumask_bits); + if (ret < 0) + goto out; + + mask_type = !strcmp(attr->name, "serial_cpumask") ? + PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; + ret = padata_set_cpumask(pinst, mask_type, new_cpumask); + if (!ret) + ret = count; + +out: + free_cpumask_var(new_cpumask); + return ret; +} + +#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0644, _show_name, _store_name) +#define PADATA_ATTR_RO(_name, _show_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0400, _show_name, NULL) + +PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); +PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); + +/* + * Padata sysfs provides the following objects: + * serial_cpumask [RW] - cpumask for serial workers + * parallel_cpumask [RW] - cpumask for parallel workers + */ +static struct attribute *padata_default_attrs[] = { + &serial_cpumask_attr.attr, + ¶llel_cpumask_attr.attr, + NULL, +}; + +static ssize_t padata_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->show(pinst, attr, buf); + + return ret; +} + +static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->store(pinst, attr, buf, count); + + return ret; +} + +static const struct sysfs_ops padata_sysfs_ops = { + .show = padata_sysfs_show, + .store = padata_sysfs_store, +}; + +static struct kobj_type padata_attr_type = { + .sysfs_ops = &padata_sysfs_ops, + .default_attrs = padata_default_attrs, + .release = padata_sysfs_release, +}; + /** - * padata_alloc - allocate and initialize a padata instance + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. * - * @cpumask: cpumask that padata uses for parallelization * @wq: workqueue to use for the allocated padata instance */ -struct padata_instance *padata_alloc(const struct cpumask *cpumask, - struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** + * padata_alloc - allocate and initialize a padata instance and specify + * cpumasks for serial and parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * @pcpumask: cpumask that will be used for padata parallelization + * @cbcpumask: cpumask that will be used for padata serialization + */ +struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; - struct parallel_data *pd; + struct parallel_data *pd = NULL; pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); if (!pinst) goto err; get_online_cpus(); - - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pinst->cpumask.pcpu); + goto err_free_inst; + } + if (!padata_validate_cpumask(pinst, pcpumask) || + !padata_validate_cpumask(pinst, cbcpumask)) + goto err_free_masks; - if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) - goto err_free_pd; + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + goto err_free_masks; rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; - cpumask_copy(pinst->cpumask, cpumask); + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); pinst->flags = 0; @@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, put_online_cpus(); + BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); + kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); return pinst; -err_free_pd: - padata_free_pd(pd); +err_free_masks: + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); put_online_cpus(); @@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc); */ void padata_free(struct padata_instance *pinst) { - padata_stop(pinst); - - synchronize_rcu(); - -#ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); -#endif - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); - - padata_free_pd(pinst->pd); - free_cpumask_var(pinst->cpumask); - kfree(pinst); + kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4..403d1804b19 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } /* @@ -675,7 +675,6 @@ group_sched_in(struct perf_event *group_event, struct perf_event *event, *partial_group = NULL; const struct pmu *pmu = group_event->pmu; bool txn = false; - int ret; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; @@ -703,14 +702,8 @@ group_sched_in(struct perf_event *group_event, } } - if (!txn) - return 0; - - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); + if (!txn || !pmu->commit_txn(pmu)) return 0; - } group_error: /* @@ -1155,9 +1148,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_event->count); - value = atomic64_xchg(&event->count, value); - atomic64_set(&next_event->count, value); + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); @@ -1547,10 +1540,10 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; - if (atomic64_read(&hwc->period_left) > 8*sample_period) { + if (local64_read(&hwc->period_left) > 8*sample_period) { perf_disable(); perf_event_stop(event); - atomic64_set(&hwc->period_left, 0); + local64_set(&hwc->period_left, 0); perf_event_start(event); perf_enable(); } @@ -1591,7 +1584,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) perf_disable(); event->pmu->read(event); - now = atomic64_read(&event->count); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; @@ -1743,6 +1736,11 @@ static void __perf_event_read(void *info) event->pmu->read(event); } +static inline u64 perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -1762,7 +1760,7 @@ static u64 perf_event_read(struct perf_event *event) raw_spin_unlock_irqrestore(&ctx->lock, flags); } - return atomic64_read(&event->count); + return perf_event_count(event); } /* @@ -1883,7 +1881,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); -static void perf_mmap_data_put(struct perf_mmap_data *data); +static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { @@ -1891,7 +1889,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { atomic_dec(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); @@ -1899,9 +1897,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->data) { - perf_mmap_data_put(event->data); - event->data = NULL; + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; } if (event->destroy) @@ -2126,13 +2124,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) - events = atomic_xchg(&data->poll, 0); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -2143,7 +2141,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); - atomic64_set(&event->count, 0); + local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -2342,14 +2340,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; - userpg = data->user_page; + userpg = buffer->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -2359,9 +2357,9 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = atomic64_read(&event->count); + userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= atomic64_read(&event->hw.prev_count); + userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -2376,6 +2374,25 @@ unlock: rcu_read_unlock(); } +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2383,15 +2400,15 @@ unlock: */ static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > data->nr_pages) + if (pgoff > buffer->nr_pages) return NULL; if (pgoff == 0) - return virt_to_page(data->user_page); + return virt_to_page(buffer->user_page); - return virt_to_page(data->data_pages[pgoff - 1]); + return virt_to_page(buffer->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) @@ -2407,42 +2424,44 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; int i; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - data->user_page = perf_mmap_alloc_page(event->cpu); - if (!data->user_page) + buffer->user_page = perf_mmap_alloc_page(cpu); + if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = perf_mmap_alloc_page(event->cpu); - if (!data->data_pages[i]) + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!buffer->data_pages[i]) goto fail_data_pages; } - data->nr_pages = nr_pages; + buffer->nr_pages = nr_pages; + + perf_buffer_init(buffer, watermark, flags); - return data; + return buffer; fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); + free_page((unsigned long)buffer->data_pages[i]); - free_page((unsigned long)data->user_page); + free_page((unsigned long)buffer->user_page); fail_user_page: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2456,17 +2475,17 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { int i; - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - kfree(data); + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); } -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { return 0; } @@ -2479,18 +2498,18 @@ static inline int page_order(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { - return data->page_order; + return buffer->page_order; } static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > (1UL << page_order(data))) + if (pgoff > (1UL << page_order(buffer))) return NULL; - return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) @@ -2500,57 +2519,59 @@ static void perf_mmap_unmark_page(void *addr) page->mapping = NULL; } -static void perf_mmap_data_free_work(struct work_struct *work) +static void perf_buffer_free_work(struct work_struct *work) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; void *base; int i, nr; - data = container_of(work, struct perf_mmap_data, work); - nr = 1 << page_order(data); + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); - base = data->user_page; + base = buffer->user_page; for (i = 0; i < nr + 1; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); - kfree(data); + kfree(buffer); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { - schedule_work(&data->work); + schedule_work(&buffer->work); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; void *all_buf; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - INIT_WORK(&data->work, perf_mmap_data_free_work); + INIT_WORK(&buffer->work, perf_buffer_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; - data->user_page = all_buf; - data->data_pages[0] = all_buf + PAGE_SIZE; - data->page_order = ilog2(nr_pages); - data->nr_pages = 1; + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; + + perf_buffer_init(buffer, watermark, flags); - return data; + return buffer; fail_all_buf: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2558,15 +2579,15 @@ fail: #endif -static unsigned long perf_data_size(struct perf_mmap_data *data) +static unsigned long perf_data_size(struct perf_buffer *buffer) { - return data->nr_pages << (PAGE_SHIFT + page_order(data)); + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -2576,14 +2597,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(data, vmf->pgoff); + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); if (!vmf->page) goto unlock; @@ -2598,52 +2619,35 @@ unlock: return ret; } -static void -perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) -{ - long max_size = perf_data_size(data); - - if (event->attr.watermark) { - data->watermark = min_t(long, max_size, - event->attr.wakeup_watermark); - } - - if (!data->watermark) - data->watermark = max_size / 2; - - atomic_set(&data->refcount, 1); - rcu_assign_pointer(event->data, data); -} - -static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_data_free(data); + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); } -static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) +static struct perf_buffer *perf_buffer_get(struct perf_event *event) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) { - if (!atomic_inc_not_zero(&data->refcount)) - data = NULL; + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; } rcu_read_unlock(); - return data; + return buffer; } -static void perf_mmap_data_put(struct perf_mmap_data *data) +static void perf_buffer_put(struct perf_buffer *buffer) { - if (!atomic_dec_and_test(&data->refcount)) + if (!atomic_dec_and_test(&buffer->refcount)) return; - call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2658,16 +2662,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->data); + unsigned long size = perf_data_size(event->buffer); struct user_struct *user = event->mmap_user; - struct perf_mmap_data *data = event->data; + struct perf_buffer *buffer = event->buffer; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->data, NULL); + rcu_assign_pointer(event->buffer, NULL); mutex_unlock(&event->mmap_mutex); - perf_mmap_data_put(data); + perf_buffer_put(buffer); free_uid(user); } } @@ -2685,11 +2689,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; - int ret = 0; + int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -2706,7 +2710,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have data pages ensure they're a power-of-two number, so we + * If we have buffer pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -2720,9 +2724,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->data) { - if (event->data->nr_pages == nr_pages) - atomic_inc(&event->data->refcount); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); else ret = -EINVAL; goto unlock; @@ -2752,17 +2756,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->data); + WARN_ON(event->buffer); + + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; - data = perf_mmap_data_alloc(event, nr_pages); - if (!data) { + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); + if (!buffer) { ret = -ENOMEM; goto unlock; } - - perf_mmap_data_init(event, data); - if (vma->vm_flags & VM_WRITE) - event->data->writable = 1; + rcu_assign_pointer(event->buffer, buffer); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -2941,11 +2946,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } -__weak -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ -} - /* * We assume there is only KVM supporting the callbacks. @@ -2971,15 +2971,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); /* * Output */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, unsigned long offset, unsigned long head) { unsigned long mask; - if (!data->writable) + if (!buffer->writable) return true; - mask = perf_data_size(data) - 1; + mask = perf_data_size(buffer) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2992,7 +2992,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->data->poll, POLL_IN); + atomic_set(&handle->buffer->poll, POLL_IN); if (handle->nmi) { handle->event->pending_wakeup = 1; @@ -3012,45 +3012,45 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; preempt_disable(); - local_inc(&data->nest); - handle->wakeup = local_read(&data->wakeup); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; unsigned long head; again: - head = local_read(&data->head); + head = local_read(&buffer->head); /* * IRQ/NMI can happen here, which means we can miss a head update. */ - if (!local_dec_and_test(&data->nest)) + if (!local_dec_and_test(&buffer->nest)) goto out; /* * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the data->head read and this + * by atomic_dec_and_test() order the buffer->head read and this * write. */ - data->user_page->data_head = head; + buffer->user_page->data_head = head; /* * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read data->head. + * barrier in atomic_dec_and_test() to re-read buffer->head. */ - if (unlikely(head != local_read(&data->head))) { - local_inc(&data->nest); + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); goto again; } - if (handle->wakeup != local_read(&data->wakeup)) + if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); out: @@ -3070,12 +3070,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, buf += size; handle->size -= size; if (!handle->size) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; handle->page++; - handle->page &= data->nr_pages - 1; - handle->addr = data->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(data); + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); } } while (len); } @@ -3084,7 +3084,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; struct { @@ -3100,19 +3100,19 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto out; - handle->data = data; + handle->buffer = buffer; handle->event = event; handle->nmi = nmi; handle->sample = sample; - if (!data->nr_pages) + if (!buffer->nr_pages) goto out; - have_lost = local_read(&data->lost); + have_lost = local_read(&buffer->lost); if (have_lost) size += sizeof(lost_event); @@ -3124,30 +3124,30 @@ int perf_output_begin(struct perf_output_handle *handle, * tail pointer. So that all reads will be completed before the * write is issued. */ - tail = ACCESS_ONCE(data->user_page->data_tail); + tail = ACCESS_ONCE(buffer->user_page->data_tail); smp_rmb(); - offset = head = local_read(&data->head); + offset = head = local_read(&buffer->head); head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) + if (unlikely(!perf_output_space(buffer, tail, offset, head))) goto fail; - } while (local_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&buffer->head, offset, head) != offset); - if (head - local_read(&data->wakeup) > data->watermark) - local_add(data->watermark, &data->wakeup); + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(data)); - handle->page &= data->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); - handle->addr = data->data_pages[handle->page]; + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(data)) - handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = local_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); } @@ -3155,7 +3155,7 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - local_inc(&data->lost); + local_inc(&buffer->lost); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3166,15 +3166,15 @@ out: void perf_output_end(struct perf_output_handle *handle) { struct perf_event *event = handle->event; - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = local_inc_return(&data->events); + int events = local_inc_return(&buffer->events); if (events >= wakeup_events) { - local_sub(wakeup_events, &data->events); - local_inc(&data->wakeup); + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); } } @@ -3211,7 +3211,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[4]; int n = 0; - values[n++] = atomic64_read(&event->count); + values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -3248,7 +3248,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (leader != event) leader->pmu->read(leader); - values[n++] = atomic64_read(&leader->count); + values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); @@ -3260,7 +3260,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (sub != event) sub->pmu->read(sub); - values[n++] = atomic64_read(&sub->count); + values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -3491,7 +3491,7 @@ perf_event_read_event(struct perf_event *event, /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task */ struct perf_task_event { @@ -3541,7 +3541,8 @@ static int perf_event_task_match(struct perf_event *event) if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.comm || event->attr.mmap || event->attr.task) + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) return 1; return 0; @@ -3766,7 +3767,8 @@ static void perf_event_mmap_output(struct perf_event *event, } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; @@ -3774,19 +3776,21 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.mmap) + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) return 1; return 0; } static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event)) + if (perf_event_mmap_match(event, mmap_event, executable)) perf_event_mmap_output(event, mmap_event); } } @@ -3830,6 +3834,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; } name = strncpy(tmp, "//anon", sizeof(tmp)); @@ -3846,17 +3858,17 @@ got_name: rcu_read_lock(); cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_event_mmap_ctx(ctx, mmap_event); + perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); put_cpu_var(perf_cpu_context); rcu_read_unlock(); kfree(buf); } -void __perf_event_mmap(struct vm_area_struct *vma) +void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -4018,14 +4030,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; again: - old = val = atomic64_read(&hwc->period_left); + old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; @@ -4064,7 +4076,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, { struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &event->count); + local64_add(nr, &event->count); if (!regs) return; @@ -4075,7 +4087,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, nmi, data, regs); - if (atomic64_add_negative(nr, &hwc->period_left)) + if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, nmi, data, regs); @@ -4213,14 +4225,12 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void perf_swevent_put_recursion_context(int rctx) +void inline perf_swevent_put_recursion_context(int rctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); barrier(); cpuctx->recursion[rctx]--; } -EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); - void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4368,8 +4378,8 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_xchg(&event->hw.prev_count, now); - atomic64_add(now - prev, &event->count); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } static int cpu_clock_perf_event_enable(struct perf_event *event) @@ -4377,7 +4387,7 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + local64_set(&hwc->prev_count, cpu_clock(cpu)); perf_swevent_start_hrtimer(event); return 0; @@ -4409,9 +4419,9 @@ static void task_clock_perf_event_update(struct perf_event *event, u64 now) u64 prev; s64 delta; - prev = atomic64_xchg(&event->hw.prev_count, now); + prev = local64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &event->count); + local64_add(delta, &event->count); } static int task_clock_perf_event_enable(struct perf_event *event) @@ -4421,7 +4431,7 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; - atomic64_set(&hwc->prev_count, now); + local64_set(&hwc->prev_count, now); perf_swevent_start_hrtimer(event); @@ -4601,7 +4611,7 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head) + struct pt_regs *regs, struct hlist_head *head, int rctx) { struct perf_sample_data data; struct perf_event *event; @@ -4615,12 +4625,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_sample_data_init(&data, addr); data.raw = &raw; - rcu_read_lock(); hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_add(event, count, 1, &data, regs); } - rcu_read_unlock(); + + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -4864,7 +4874,7 @@ perf_event_alloc(struct perf_event_attr *attr, hwc->sample_period = 1; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_FORMAT_GROUP on inherited events @@ -4913,7 +4923,7 @@ done: if (!event->parent) { atomic_inc(&nr_events); - if (event->attr.mmap) + if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); @@ -5007,7 +5017,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_mmap_data *data = NULL, *old_data = NULL; + struct perf_buffer *buffer = NULL, *old_buffer = NULL; int ret = -EINVAL; if (!output_event) @@ -5037,19 +5047,19 @@ set: if (output_event) { /* get the buffer we want to redirect to */ - data = perf_mmap_data_get(output_event); - if (!data) + buffer = perf_buffer_get(output_event); + if (!buffer) goto unlock; } - old_data = event->data; - rcu_assign_pointer(event->data, data); + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_data) - perf_mmap_data_put(old_data); + if (old_buffer) + perf_buffer_put(old_buffer); out: return ret; } @@ -5298,7 +5308,7 @@ inherit_event(struct perf_event *parent_event, hwc->sample_period = sample_period; hwc->last_period = sample_period; - atomic64_set(&hwc->period_left, sample_period); + local64_set(&hwc->period_left, sample_period); } child_event->overflow_handler = parent_event->overflow_handler; @@ -5359,12 +5369,12 @@ static void sync_child_event(struct perf_event *child_event, if (child_event->attr.inherit_stat) perf_event_read_event(child_event, child); - child_val = atomic64_read(&child_event->count); + child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_event->count); + atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index f42d3f737a3..996a4dec5f9 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -48,59 +48,49 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_request_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - int pm_qos_class; +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ }; -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - struct pm_qos_object { - struct pm_qos_request_list requests; + struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); + enum pm_qos_type type; }; +static DEFINE_SPINLOCK(pm_qos_lock); + static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare + .type = PM_QOS_MAX, }; @@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = { &network_throughput_pm_qos }; -static DEFINE_SPINLOCK(pm_qos_lock); - static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); @@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = { .release = pm_qos_power_release, }; -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) { - return max(v1, v2); -} + if (plist_head_empty(&o->requests)) + return o->default_value; -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} + switch (o->type) { + case PM_QOS_MIN: + return plist_last(&o->requests)->prio; + case PM_QOS_MAX: + return plist_first(&o->requests)->prio; -static void update_target(int pm_qos_class) + default: + /* runtime check for not using enum */ + BUG(); + } +} + +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) { - s32 extreme_value; - struct pm_qos_request_list *node; unsigned long flags; - int call_notifier = 0; + int prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[pm_qos_class]->default_value; - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requests.list, list) { - extreme_value = pm_qos_array[pm_qos_class]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != - extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[pm_qos_class]->target_value, - extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, - atomic_read(&pm_qos_array[pm_qos_class]->target_value)); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); } + curr_value = pm_qos_get_value(o); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (call_notifier) - blocking_notifier_call_chain( - pm_qos_array[pm_qos_class]->notifiers, - (unsigned long) extreme_value, NULL); + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); + unsigned long flags; + int value; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[pm_qos_class]); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return value; } EXPORT_SYMBOL_GPL(pm_qos_request); +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); + /** * pm_qos_add_request - inserts new qos request into the list * @pm_qos_class: identifies which list of qos request to us @@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request); * element as a handle for use in updating and removal. Call needs to save * this handle for later use. */ -struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) { - struct pm_qos_request_list *dep; - unsigned long flags; + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; - dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->pm_qos_class = pm_qos_class; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requests.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; } - - return dep; + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); } EXPORT_SYMBOL_GPL(pm_qos_add_request); @@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * Attempts are made to make this code callable on hot code paths. */ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, - s32 new_value) + s32 new_value) { - unsigned long flags; - int pending_update = 0; s32 temp; + struct pm_qos_object *o; + + if (!pm_qos_req) /*guard against callers passing in null */ + return; - if (pm_qos_req) { /*guard against callers passing in null */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->value) { - pending_update = 1; - pm_qos_req->value = temp; - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_req->pm_qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); } EXPORT_SYMBOL_GPL(pm_qos_update_request); @@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request); */ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { - unsigned long flags; - int qos_class; + struct pm_qos_object *o; if (pm_qos_req == NULL) return; /* silent return to keep pcm code cleaner */ - qos_class = pm_qos_req->pm_qos_class; - spin_lock_irqsave(&pm_qos_lock, flags); - list_del(&pm_qos_req->list); - kfree(pm_qos_req); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; + } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *) pm_qos_add_request(pm_qos_class, - PM_QOS_DEFAULT_VALUE); + struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; if (filp->private_data) return 0; @@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) { struct pm_qos_request_list *req; - req = (struct pm_qos_request_list *)filp->private_data; + req = filp->private_data; pm_qos_remove_request(req); + kfree(req); return 0; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0513900995c..6842eeba587 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - struct sighand_struct *sighand; - struct signal_struct *sig; + struct signal_struct *sig = tsk->signal; struct task_struct *t; - *times = INIT_CPUTIME; + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - if (!sighand) + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) goto out; - sig = tsk->signal; - t = tsk; do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); times->sum_exec_runtime += t->se.sum_exec_runtime; - - t = next_thread(t); - } while (t != tsk); - - times->utime = cputime_add(times->utime, sig->utime); - times->stime = cputime_add(times->stime, sig->stime); - times->sum_exec_runtime += sig->sum_sched_runtime; + } while_each_thread(tsk, t); out: rcu_read_unlock(); } @@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - /* tsk == current, ensure it is safe to use ->signal/sighand */ - if (unlikely(tsk->exit_state)) - return 0; - if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { .utime = tsk->utime, @@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - thread_group_cputimer(tsk, &group_sample); + spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + spin_unlock(&sig->cputimer.lock); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (!fastpath_timer_check(tsk)) return; - spin_lock(&tsk->sighand->siglock); + if (!lock_task_sighand(tsk, &flags)) + return; /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad723420acc..9ca4973f736 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_clock = which_clock; new_timer->it_overrun = -1; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } if (timer_event_spec) { if (copy_from_user(&event, timer_event_spec, sizeof (event))) { error = -EFAULT; @@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); if (error) goto out; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aa9e916da4d..c77963938bc 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * * This file is released under the GPLv2. @@ -277,7 +277,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE)) + if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) goto Power_up; in_suspend = 1; @@ -288,8 +288,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) + if (!in_suspend) { + events_check_enabled = false; platform_leave(platform_mode); + } Power_up: sysdev_resume(); @@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode) error = platform_begin(platform_mode); if (error) - return error; + goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); @@ -336,6 +338,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + hibernation_freeze_swap(); saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) @@ -511,18 +514,24 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + if (!pm_check_wakeup_events()) { + error = -EAGAIN; + goto Power_up; + } + hibernation_ops->enter(); /* We should never get here */ while (1); - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ + Power_up: + sysdev_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + Platform_finish: hibernation_ops->finish(); - dpm_suspend_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index b58800b21fc..62b0bc6e498 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(state); +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long val; + + return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (sscanf(buf, "%lu", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -236,6 +290,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, + &wakeup_count_attr.attr, #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae29052ab..028a99598f4 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -15,6 +15,7 @@ #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> +#include <linux/workqueue.h> /* * Timeout for stopping processes @@ -35,6 +36,7 @@ static int try_to_freeze_tasks(bool sig_only) struct task_struct *g, *p; unsigned long end_time; unsigned int todo; + bool wq_busy = false; struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; @@ -42,6 +44,10 @@ static int try_to_freeze_tasks(bool sig_only) do_gettimeofday(&start); end_time = jiffies + TIMEOUT; + + if (!sig_only) + freeze_workqueues_begin(); + while (true) { todo = 0; read_lock(&tasklist_lock); @@ -63,6 +69,12 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); + + if (!sig_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + if (!todo || time_after(jiffies, end_time)) break; @@ -86,8 +98,12 @@ static int try_to_freeze_tasks(bool sig_only) */ printk("\n"); printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); + "(%d tasks refusing to freeze, wq_busy=%d):\n", + elapsed_csecs / 100, elapsed_csecs % 100, + todo - wq_busy, wq_busy); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); @@ -157,6 +173,7 @@ void thaw_processes(void) oom_killer_enable(); printk("Restarting tasks ... "); + thaw_workqueues(); thaw_tasks(true); thaw_tasks(false); schedule(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 25ce010e9f8..5e7edfb05e6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -3,7 +3,7 @@ * * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -1086,6 +1086,7 @@ void swsusp_free(void) buffer = NULL; alloc_normal = 0; alloc_highmem = 0; + hibernation_thaw_swap(); } /* Helper functions used for the shrinking of memory. */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f37cb7dd440..7335952ee47 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - return error; + goto Platform_finish; } error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; + goto Platform_finish; } if (suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) - goto Power_up_devices; + goto Platform_wake; } if (suspend_test(TEST_PLATFORM)) @@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE)) + if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { error = suspend_ops->enter(state); + events_check_enabled = false; + } sysdev_resume(); } @@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->wake) suspend_ops->wake(); - Power_up_devices: dpm_resume_noirq(PMSG_RESUME); - Platfrom_finish: + Platform_finish: if (suspend_ops->finish) suspend_ops->finish(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b0bb2177839..5d0059eed3e 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,7 +4,7 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -32,7 +32,7 @@ /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. * These structures are stored on the swap and linked together with the * help of the .next_swap member. * @@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap) { unsigned long offset; - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(get_swap_for_hibernation(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_for_hibernation(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap) /** * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been + * It also frees the extents used to register which swap entries had been * allocated. */ @@ -163,7 +163,7 @@ void free_all_swap_pages(int swap) ext = container_of(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) - swap_free(swp_entry(swap, offset)); + swap_free_for_hibernation(swp_entry(swap, offset)); kfree(ext); } diff --git a/kernel/printk.c b/kernel/printk.c index 444b770c959..8fe465ac008 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -37,6 +37,8 @@ #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> +#include <linux/cpu.h> +#include <linux/notifier.h> #include <asm/uaccess.h> @@ -985,6 +987,32 @@ void resume_console(void) } /** + * console_cpu_notify - print deferred console messages after CPU hotplug + * @self: notifier struct + * @action: CPU hotplug event + * @hcpu: unused + * + * If printk() is called from a CPU that is not online yet, the messages + * will be spooled but will not show up on the console. This function is + * called when a new CPU comes online (or fails to come up), and ensures + * that any such output gets printed. + */ +static int __cpuinit console_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + case CPU_DYING: + case CPU_DOWN_FAILED: + case CPU_UP_CANCELED: + acquire_console_sem(); + release_console_sem(); + } + return NOTIFY_OK; +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has @@ -1371,7 +1399,7 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); -static int __init disable_boot_consoles(void) +static int __init printk_late_init(void) { struct console *con; @@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void) unregister_console(con); } } + hotcpu_notifier(console_cpu_notify, 0); return 0; } -late_initcall(disable_boot_consoles); +late_initcall(printk_late_init); #if defined CONFIG_PRINTK @@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) chars = logged_chars; spin_unlock_irqrestore(&logbuf_lock, flags); - if (logged_chars > end) { - s1 = log_buf + log_buf_len - logged_chars + end; - l1 = logged_chars - end; + if (chars > end) { + s1 = log_buf + log_buf_len - chars + end; + l1 = chars - end; s2 = log_buf; l2 = end; @@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason) s1 = ""; l1 = 0; - s2 = log_buf + end - logged_chars; - l2 = logged_chars; + s2 = log_buf + end - chars; + l2 = chars; } if (!spin_trylock_irqsave(&dump_list_lock, flags)) { diff --git a/kernel/range.c b/kernel/range.c index 74e2e611492..471b66acabb 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -7,10 +7,6 @@ #include <linux/range.h> -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) { if (start >= end) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9567f..4d169835fb3 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -114,3 +114,163 @@ int rcu_my_thread_group_empty(void) } EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); #endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#else + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; +#endif + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3cd23..196ec02f8be 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -169,6 +169,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; } @@ -211,6 +212,7 @@ static void __call_rcu(struct rcu_head *head, { unsigned long flags; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a..2e2726d790b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -239,8 +239,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d4437345706..d5bc43976c5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1112,6 +1112,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1388,6 +1389,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; diff --git a/kernel/sched.c b/kernel/sched.c index f52a8801b7a..41541d79e3c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -77,6 +77,7 @@ #include <asm/irq_regs.h> #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -456,9 +457,10 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu) #ifdef CONFIG_NO_HZ /* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} +/* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event * which is scheduled to wake up that CPU. In case of a completely @@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -int nohz_ratelimit(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 diff = rq->clock - rq->nohz_stamp; - - rq->nohz_stamp = rq->clock; - - return diff < (NSEC_PER_SEC / HZ) >> 1; -} - #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { @@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample) } #endif -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local, + unsigned long en_flags) +{ + schedstat_inc(p, se.statistics.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.statistics.nr_wakeups_local); + else + schedstat_inc(p, se.statistics.nr_wakeups_remote); + + activate_task(rq, p, en_flags); +} + +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/** * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.statistics.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); - activate_task(rq, p, en_flags); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu, en_flags); success = 1; - out_running: - trace_sched_wakeup(p, success); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif + ttwu_post_activation(p, rq, wake_flags, success); out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2399,6 +2431,37 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. */ static void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { unsigned long old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq) * example. */ if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } +} + +static void update_cpu_load_active(struct rq *this_rq) +{ + update_cpu_load(this_rq); calc_load_account_active(this_rq); } @@ -3426,7 +3574,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load(rq); + update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); @@ -3598,7 +3746,6 @@ need_resched: rq = cpu_rq(cpu); rcu_note_context_switch(cpu); prev = rq->curr; - switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3611,11 +3758,26 @@ need_resched_nonpreemptible: raw_spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } @@ -3637,8 +3799,10 @@ need_resched_nonpreemptible: context_switch(rq, prev, next); /* unlocks the rq */ /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3647,11 +3811,8 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; + if (unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (need_resched()) @@ -3726,7 +3887,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched preempt_schedule(void) +asmlinkage void __sched notrace preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -3738,9 +3899,9 @@ asmlinkage void __sched preempt_schedule(void) return; do { - add_preempt_count(PREEMPT_ACTIVE); + add_preempt_count_notrace(PREEMPT_ACTIVE); schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count_notrace(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -4441,12 +4602,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &flags); + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) @@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd) { - gfp_t gfp = GFP_KERNEL; - memset(rd, 0, sizeof(*rd)); - if (bootmem) - gfp = GFP_NOWAIT; - - if (!alloc_cpumask_var(&rd->span, gfp)) + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, gfp)) + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, gfp)) + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - if (cpupri_init(&rd->cpupri, bootmem) != 0) + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6096,7 +6277,7 @@ out: static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain, true); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } @@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - if (init_rootdomain(rd, false) != 0) { + if (init_rootdomain(rd) != 0) { kfree(rd); return NULL; } @@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7356,10 +7543,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); @@ -7604,6 +7789,9 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7617,6 +7805,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7661,8 +7853,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb..52f1a149bfb 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar <mingo@redhat.com> * Guillaume Chazarain <guichaz@gmail.com> * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -170,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb3fc8..2722dc1b413 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp) { - gfp_t gfp = GFP_KERNEL; int i; - if (bootmem) - gfp = GFP_NOWAIT; - memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) raw_spin_lock_init(&vec->lock); vec->count = 0; - if (!zalloc_cpumask_var(&vec->mask, gfp)) + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95b..9fc7d386fea 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -27,7 +27,7 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp, bool bootmem); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 35565395d00..2e1b0d17dd9 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_child_runs_first); + P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b5332da..806d1b227a2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); @@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_LOAD_SHIFT; } + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; @@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) sdg->cpu_power = power; } +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 > group->cpu_power_orig * 29) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. @@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu) { - *balance = 0; - return; + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); } - update_group_power(sd, this_cpu); - /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; @@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; } /** @@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. + * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; - struct sched_group *group = sd->groups; + struct sched_group *sg = sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; @@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, do { int local_group; - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->cpu_power; + sds->total_pwr += sg->cpu_power; /* * In case the child domain prefers tasks go to siblings - * first, lower the group capacity to one so that we'll try + * first, lower the sg capacity to one so that we'll try * and move all the excess tasks away. */ if (prefer_sibling) @@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, if (local_group) { sds->this_load = sgs.avg_load; - sds->this = group; + sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds->max_load && - (sgs.sum_nr_running > sgs.group_capacity || - sgs.group_imb)) { + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; - sds->busiest = group; + sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(group, sds, local_group, &sgs); - group = group->next; - } while (group != sd->groups); + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched doman. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. + */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + SCHED_LOAD_SCALE); + return 1; } /** @@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!(*balance)) goto ret; + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; @@ -2726,8 +2850,9 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; + if (!capacity) + capacity = fix_small_capacity(sd, group); + if (!cpumask_test_cpu(i, cpus)) continue; @@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, + int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + /* * The only task running in a non-idle cpu can be moved to this * cpu in an attempt to completely freeup the other CPU @@ -2854,7 +2992,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, cpus); + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2898,7 +3036,8 @@ redo: schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle)) { + if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), + this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3093,13 +3232,40 @@ out_unlock: } #ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ static struct { atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; int get_nohz_load_balancer(void) { @@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static inline int is_semi_idle_group(struct sched_group *ilb_group) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, sched_group_cpus(ilb_group)); /* * A sched_group is semi-idle when it has atleast one busy cpu * and atleast one idle cpu. */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + if (cpumask_empty(nohz.grp_idle_mask)) return 0; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) return 0; return 1; @@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu) * Optimize for the case when we have no idle CPUs or only one * idle CPU. Don't walk the sched_domain hierarchy in such cases */ - if (cpumask_weight(nohz.cpu_mask) < 2) + if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { @@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu) do { if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); + return cpumask_first(nohz.grp_idle_mask); ilb_group = ilb_group->next; @@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu) } out_done: - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #endif /* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + +/* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. + * load balancing on behalf of all those cpus. * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). */ -int select_nohz_load_balancer(int stop_tick) +void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) - return 0; + return; /* * If we are going offline and still the leader, * give up! */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); - return 0; + return; } - cpumask_set_cpu(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { int new_ilb; - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); + atomic_set(&nohz.load_balancer, nr_cpu_ids); resched_cpu(new_ilb); - return 0; + return; } - return 1; + return; } } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; - cpumask_clear_cpu(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); } - return 0; + return; } #endif @@ -3385,11 +3569,102 @@ out: rq->next_balance = next_balance; } +#ifdef CONFIG_NO_HZ /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the + * In CONFIG_NO_HZ case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (!rq->nr_running) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); @@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(this_cpu, idle); -#ifdef CONFIG_NO_HZ /* - * If this cpu is the owner for idle load balancing, then do the + * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + nohz_idle_balance(this_cpu, idle); } static inline int on_null_domain(int cpu) @@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif } static void rq_online_fair(struct rq *rq) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953e31c..d10c80ebb67 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; - if (!p->signal) - return; - /* max may change after cur was read, this will be fixed next tick */ soft = task_rlimit(p, RLIMIT_RTTIME); hard = task_rlimit_max(p, RLIMIT_RTTIME); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b..25c2f962f6f 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer; - struct signal_struct *sig; - - sig = tsk->signal; - /* see __exit_signal()->task_rq_unlock_wait() */ - barrier(); - if (unlikely(!sig)) - return; - - cputimer = &sig->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; diff --git a/kernel/signal.c b/kernel/signal.c index 906ae5a1779..bded6518778 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info) /* * Bad permissions for sending the signal - * - the caller must hold at least the RCU read lock + * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) @@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long /* * send signal info to all the members of a group - * - the caller must hold the RCU read lock at least */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + int ret; + + rcu_read_lock(); + ret = check_kill_permission(sig, info, p); + rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, true); diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c4364529..00000000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Slow work debugging - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/seq_file.h> -#include "slow-work.h" - -#define ITERATOR_SHIFT (BITS_PER_LONG - 4) -#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) -#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) - -void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) -{ - seq_puts(m, "Slow-work: New thread"); -} - -/* - * Render the time mark field on a work item into a 5-char time with units plus - * a space - */ -static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) -{ - struct timespec now, diff; - - now = CURRENT_TIME; - diff = timespec_sub(now, work->mark); - - if (diff.tv_sec < 0) - seq_puts(m, " -ve "); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) - seq_printf(m, "%3luns ", diff.tv_nsec); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) - seq_printf(m, "%3luus ", diff.tv_nsec / 1000); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) - seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); - else if (diff.tv_sec <= 1) - seq_puts(m, " 1s "); - else if (diff.tv_sec < 60) - seq_printf(m, "%4lus ", diff.tv_sec); - else if (diff.tv_sec < 60 * 60) - seq_printf(m, "%4lum ", diff.tv_sec / 60); - else if (diff.tv_sec < 60 * 60 * 24) - seq_printf(m, "%4luh ", diff.tv_sec / 3600); - else - seq_puts(m, "exces "); -} - -/* - * Describe a slow work item for debugfs - */ -static int slow_work_runqueue_show(struct seq_file *m, void *v) -{ - struct slow_work *work; - struct list_head *p = v; - unsigned long id; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); - return 0; - case 2: - seq_puts(m, "=== ===== ================ == ===== ==========\n"); - return 0; - - case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: - id = (unsigned long) v - 3; - - read_lock(&slow_work_execs_lock); - work = slow_work_execs[id]; - if (work) { - smp_read_barrier_depends(); - - seq_printf(m, "%3lu %5d %16p %2lx ", - id, slow_work_pids[id], work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - } - read_unlock(&slow_work_execs_lock); - return 0; - - default: - work = list_entry(p, struct slow_work, link); - seq_printf(m, "%3s - %16p %2lx ", - work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", - work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - return 0; - } -} - -/* - * map the iterator to a work item - */ -static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) -{ - struct list_head *p; - unsigned long count, id; - - switch (*_pos >> ITERATOR_SHIFT) { - case 0x0: - if (*_pos == 0) - *_pos = 1; - if (*_pos < 3) - return (void *)(unsigned long) *_pos; - if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) - for (id = *_pos - 3; - id < SLOW_WORK_THREAD_LIMIT; - id++, (*_pos)++) - if (slow_work_execs[id]) - return (void *)(unsigned long) *_pos; - *_pos = 0x1UL << ITERATOR_SHIFT; - - case 0x1: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &slow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - - case 0x2: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &vslow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) -{ - spin_lock_irq(&slow_work_queue_lock); - return slow_work_runqueue_index(m, _pos); -} - -/* - * move to the next line - */ -static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) -{ - struct list_head *p = v; - unsigned long selector = *_pos >> ITERATOR_SHIFT; - - (*_pos)++; - switch (selector) { - case 0x0: - return slow_work_runqueue_index(m, _pos); - - case 0x1: - if (*_pos >> ITERATOR_SHIFT == 0x1) { - p = p->next; - if (p != &slow_work_queue) - return p; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - p = &vslow_work_queue; - - case 0x2: - if (*_pos >> ITERATOR_SHIFT == 0x2) { - p = p->next; - if (p != &vslow_work_queue) - return p; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * clean up after reading - */ -static void slow_work_runqueue_stop(struct seq_file *m, void *v) -{ - spin_unlock_irq(&slow_work_queue_lock); -} - -static const struct seq_operations slow_work_runqueue_ops = { - .start = slow_work_runqueue_start, - .stop = slow_work_runqueue_stop, - .next = slow_work_runqueue_next, - .show = slow_work_runqueue_show, -}; - -/* - * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents - */ -static int slow_work_runqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slow_work_runqueue_ops); -} - -const struct file_operations slow_work_runqueue_fops = { - .owner = THIS_MODULE, - .open = slow_work_runqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa9ef4..00000000000 --- a/kernel/slow-work.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* Worker thread pool for slow items, such as filesystem lookups or mkdirs - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - * - * See Documentation/slow-work.txt - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/wait.h> -#include <linux/debugfs.h> -#include "slow-work.h" - -static void slow_work_cull_timeout(unsigned long); -static void slow_work_oom_timeout(unsigned long); - -#ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -static int slow_work_max_threads_sysctl(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -#endif - -/* - * The pool of threads has at least min threads in it as long as someone is - * using the facility, and may have as many as max. - * - * A portion of the pool may be processing very slow operations. - */ -static unsigned slow_work_min_threads = 2; -static unsigned slow_work_max_threads = 4; -static unsigned vslow_work_proportion = 50; /* % of threads that may process - * very slow work */ - -#ifdef CONFIG_SYSCTL -static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; -static const int slow_work_min_vslow = 1; -static const int slow_work_max_vslow = 99; - -ctl_table slow_work_sysctls[] = { - { - .procname = "min-threads", - .data = &slow_work_min_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_min_threads_sysctl, - .extra1 = (void *) &slow_work_min_min_threads, - .extra2 = &slow_work_max_threads, - }, - { - .procname = "max-threads", - .data = &slow_work_max_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_max_threads_sysctl, - .extra1 = &slow_work_min_threads, - .extra2 = (void *) &slow_work_max_max_threads, - }, - { - .procname = "vslow-percentage", - .data = &vslow_work_proportion, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &slow_work_min_vslow, - .extra2 = (void *) &slow_work_max_vslow, - }, - {} -}; -#endif - -/* - * The active state of the thread pool - */ -static atomic_t slow_work_thread_count; -static atomic_t vslow_work_executing_count; - -static bool slow_work_may_not_start_new_thread; -static bool slow_work_cull; /* cull a thread due to lack of activity */ -static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); -static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); -static struct slow_work slow_work_new_thread; /* new thread starter */ - -/* - * slow work ID allocation (use slow_work_queue_lock) - */ -static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - -/* - * Unregistration tracking to prevent put_ref() from disappearing during module - * unload - */ -#ifdef CONFIG_MODULES -static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; -static struct module *slow_work_unreg_module; -static struct slow_work *slow_work_unreg_work_item; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); -static DEFINE_MUTEX(slow_work_unreg_sync_lock); - -static void slow_work_set_thread_processing(int id, struct slow_work *work) -{ - if (work) - slow_work_thread_processing[id] = work->owner; -} -static void slow_work_done_thread_processing(int id, struct slow_work *work) -{ - struct module *module = slow_work_thread_processing[id]; - - slow_work_thread_processing[id] = NULL; - smp_mb(); - if (slow_work_unreg_work_item == work || - slow_work_unreg_module == module) - wake_up_all(&slow_work_unreg_wq); -} -static void slow_work_clear_thread_processing(int id) -{ - slow_work_thread_processing[id] = NULL; -} -#else -static void slow_work_set_thread_processing(int id, struct slow_work *work) {} -static void slow_work_done_thread_processing(int id, struct slow_work *work) {} -static void slow_work_clear_thread_processing(int id) {} -#endif - -/* - * Data for tracking currently executing items for indication through /proc - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; -pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; -DEFINE_RWLOCK(slow_work_execs_lock); -#endif - -/* - * The queues of work items and the lock governing access to them. These are - * shared between all the CPUs. It doesn't make sense to have per-CPU queues - * as the number of threads bears no relation to the number of CPUs. - * - * There are two queues of work items: one for slow work items, and one for - * very slow work items. - */ -LIST_HEAD(slow_work_queue); -LIST_HEAD(vslow_work_queue); -DEFINE_SPINLOCK(slow_work_queue_lock); - -/* - * The following are two wait queues that get pinged when a work item is placed - * on an empty queue. These allow work items that are hogging a thread by - * sleeping in a way that could be deferred to yield their thread and enqueue - * themselves. - */ -static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); -static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); - -/* - * The thread controls. A variable used to signal to the threads that they - * should exit when the queue is empty, a waitqueue used by the threads to wait - * for signals, and a completion set by the last thread to exit. - */ -static bool slow_work_threads_should_exit; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); -static DECLARE_COMPLETION(slow_work_last_thread_exited); - -/* - * The number of users of the thread pool and its lock. Whilst this is zero we - * have no threads hanging around, and when this reaches zero, we wait for all - * active or queued work items to complete and kill all the threads we do have. - */ -static int slow_work_user_count; -static DEFINE_MUTEX(slow_work_user_lock); - -static inline int slow_work_get_ref(struct slow_work *work) -{ - if (work->ops->get_ref) - return work->ops->get_ref(work); - - return 0; -} - -static inline void slow_work_put_ref(struct slow_work *work) -{ - if (work->ops->put_ref) - work->ops->put_ref(work); -} - -/* - * Calculate the maximum number of active threads in the pool that are - * permitted to process very slow work items. - * - * The answer is rounded up to at least 1, but may not equal or exceed the - * maximum number of the threads in the pool. This means we always have at - * least one thread that can process slow work items, and we always have at - * least one thread that won't get tied up doing so. - */ -static unsigned slow_work_calc_vsmax(void) -{ - unsigned vsmax; - - vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; - vsmax /= 100; - vsmax = max(vsmax, 1U); - return min(vsmax, slow_work_max_threads - 1); -} - -/* - * Attempt to execute stuff queued on a slow thread. Return true if we managed - * it, false if there was nothing to do. - */ -static noinline bool slow_work_execute(int id) -{ - struct slow_work *work = NULL; - unsigned vsmax; - bool very_slow; - - vsmax = slow_work_calc_vsmax(); - - /* see if we can schedule a new thread to be started if we're not - * keeping up with the work */ - if (!waitqueue_active(&slow_work_thread_wq) && - (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && - atomic_read(&slow_work_thread_count) < slow_work_max_threads && - !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - - /* find something to execute */ - spin_lock_irq(&slow_work_queue_lock); - if (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax) { - work = list_entry(vslow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - atomic_inc(&vslow_work_executing_count); - very_slow = true; - } else if (!list_empty(&slow_work_queue)) { - work = list_entry(slow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - very_slow = false; - } else { - very_slow = false; /* avoid the compiler warning */ - } - - slow_work_set_thread_processing(id, work); - if (work) { - slow_work_mark_time(work); - slow_work_begin_exec(id, work); - } - - spin_unlock_irq(&slow_work_queue_lock); - - if (!work) - return false; - - if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) - BUG(); - - /* don't execute if the work is in the process of being cancelled */ - if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) - work->ops->execute(work); - - if (very_slow) - atomic_dec(&vslow_work_executing_count); - clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); - - /* wake up anyone waiting for this work to be complete */ - wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); - - slow_work_end_exec(id, work); - - /* if someone tried to enqueue the item whilst we were executing it, - * then it'll be left unenqueued to avoid multiple threads trying to - * execute it simultaneously - * - * there is, however, a race between us testing the pending flag and - * getting the spinlock, and between the enqueuer setting the pending - * flag and getting the spinlock, so we use a deferral bit to tell us - * if the enqueuer got there first - */ - if (test_bit(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irq(&slow_work_queue_lock); - - if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && - test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) - goto auto_requeue; - - spin_unlock_irq(&slow_work_queue_lock); - } - - /* sort out the race between module unloading and put_ref() */ - slow_work_put_ref(work); - slow_work_done_thread_processing(id, work); - - return true; - -auto_requeue: - /* we must complete the enqueue operation - * - we transfer our ref on the item back to the appropriate queue - * - don't wake another thread up as we're awake already - */ - slow_work_mark_time(work); - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); - spin_unlock_irq(&slow_work_queue_lock); - slow_work_clear_thread_processing(id); - return true; -} - -/** - * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work - * work: The work item under execution that wants to sleep - * _timeout: Scheduler sleep timeout - * - * Allow a requeueable work item to sleep on a slow-work processor thread until - * that thread is needed to do some other work or the sleep is interrupted by - * some other event. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * False is returned if there is nothing on the queue; true is returned if the - * work item should be requeued - */ -bool slow_work_sleep_till_thread_needed(struct slow_work *work, - signed long *_timeout) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - - DEFINE_WAIT(wait); - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - if (!list_empty(queue)) - return true; - - add_wait_queue_exclusive(wfo_wq, &wait); - if (list_empty(queue)) - *_timeout = schedule_timeout(*_timeout); - finish_wait(wfo_wq, &wait); - - return !list_empty(queue); -} -EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); - -/** - * slow_work_enqueue - Schedule a slow work item for processing - * @work: The work item to queue - * - * Schedule a slow work item for processing. If the item is already undergoing - * execution, this guarantees not to re-enter the execution routine until the - * first execution finishes. - * - * The item is pinned by this function as it retains a reference to it, managed - * through the item operations. The item is unpinned once it has been - * executed. - * - * An item may hog the thread that is running it for a relatively large amount - * of time, sufficient, for example, to perform several lookup, mkdir, create - * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. - * - * Conversely, if a number of items are awaiting processing, it may take some - * time before any given item is given attention. The number of threads in the - * pool may be increased to deal with demand, but only up to a limit. - * - * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in - * the very slow queue, from which only a portion of the threads will be - * allowed to pick items to execute. This ensures that very slow items won't - * overly block ones that are just ordinarily slow. - * - * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is - * attempted queued) - */ -int slow_work_enqueue(struct slow_work *work) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - unsigned long flags; - int ret; - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - /* when honouring an enqueue request, we only promise that we will run - * the work function in the future; we do not promise to run it once - * per enqueue request - * - * we use the PENDING bit to merge together repeat requests without - * having to disable IRQs and take the spinlock, whilst still - * maintaining our promise - */ - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) - goto cancelled; - - /* we promise that we will not attempt to execute the work - * function in more than one thread simultaneously - * - * this, however, leaves us with a problem if we're asked to - * enqueue the work whilst someone is executing the work - * function as simply queueing the work immediately means that - * another thread may try executing it whilst it is already - * under execution - * - * to deal with this, we set the ENQ_DEFERRED bit instead of - * enqueueing, and the thread currently executing the work - * function will enqueue the work item when the work function - * returns and it has cleared the EXECUTING bit - */ - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - } else { - ret = slow_work_get_ref(work); - if (ret < 0) - goto failed; - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - wake_up(&slow_work_thread_wq); - - /* if someone who could be requeued is sleeping on a - * thread, then ask them to yield their thread */ - if (work->link.prev == queue) - wake_up(wfo_wq); - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - return 0; - -cancelled: - ret = -ECANCELED; -failed: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(slow_work_enqueue); - -static int slow_work_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * slow_work_cancel - Cancel a slow work item - * @work: The work item to cancel - * - * This function will cancel a previously enqueued work item. If we cannot - * cancel the work item, it is guarenteed to have run when this function - * returns. - */ -void slow_work_cancel(struct slow_work *work) -{ - bool wait = true, put = false; - - set_bit(SLOW_WORK_CANCELLING, &work->flags); - smp_mb(); - - /* if the work item is a delayed work item with an active timer, we - * need to wait for the timer to finish _before_ getting the spinlock, - * lest we deadlock against the timer routine - * - * the timer routine will leave DELAYED set if it notices the - * CANCELLING flag in time - */ - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - del_timer_sync(&dwork->timer); - } - - spin_lock_irq(&slow_work_queue_lock); - - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - /* the timer routine aborted or never happened, so we are left - * holding the timer's reference on the item and should just - * drop the pending flag and wait for any ongoing execution to - * finish */ - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - - BUG_ON(timer_pending(&dwork->timer)); - BUG_ON(!list_empty(&work->link)); - - clear_bit(SLOW_WORK_DELAYED, &work->flags); - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && - !list_empty(&work->link)) { - /* the link in the pending queue holds a reference on the item - * that we will need to release */ - list_del_init(&work->link); - wait = false; - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { - /* the executor is holding our only reference on the item, so - * we merely need to wait for it to finish executing */ - clear_bit(SLOW_WORK_PENDING, &work->flags); - } - - spin_unlock_irq(&slow_work_queue_lock); - - /* the EXECUTING flag is set by the executor whilst the spinlock is set - * and before the item is dequeued - so assuming the above doesn't - * actually dequeue it, simply waiting for the EXECUTING flag to be - * released here should be sufficient */ - if (wait) - wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, - TASK_UNINTERRUPTIBLE); - - clear_bit(SLOW_WORK_CANCELLING, &work->flags); - if (put) - slow_work_put_ref(work); -} -EXPORT_SYMBOL(slow_work_cancel); - -/* - * Handle expiry of the delay timer, indicating that a delayed slow work item - * should now be queued if not cancelled - */ -static void delayed_slow_work_timer(unsigned long data) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - struct slow_work *work = (struct slow_work *) data; - unsigned long flags; - bool queued = false, put = false, first = false; - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { - clear_bit(SLOW_WORK_DELAYED, &work->flags); - - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - /* we discard the reference the timer was holding in - * favour of the one the executor holds */ - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - put = true; - } else { - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - queued = true; - if (work->link.prev == queue) - first = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - if (put) - slow_work_put_ref(work); - if (first) - wake_up(wfo_wq); - if (queued) - wake_up(&slow_work_thread_wq); -} - -/** - * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing - * @dwork: The delayed work item to queue - * @delay: When to start executing the work, in jiffies from now - * - * This is similar to slow_work_enqueue(), but it adds a delay before the work - * is actually queued for processing. - * - * The item can have delayed processing requested on it whilst it is being - * executed. The delay will begin immediately, and if it expires before the - * item finishes executing, the item will be placed back on the queue when it - * has done executing. - */ -int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, - unsigned long delay) -{ - struct slow_work *work = &dwork->work; - unsigned long flags; - int ret; - - if (delay == 0) - return slow_work_enqueue(&dwork->work); - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - goto cancelled; - - /* the timer holds a reference whilst it is pending */ - ret = slow_work_get_ref(work); - if (ret < 0) - goto cant_get_ref; - - if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) - BUG(); - dwork->timer.expires = jiffies + delay; - dwork->timer.data = (unsigned long) work; - dwork->timer.function = delayed_slow_work_timer; - add_timer(&dwork->timer); - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - - return 0; - -cancelled: - ret = -ECANCELED; -cant_get_ref: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(delayed_slow_work_enqueue); - -/* - * Schedule a cull of the thread pool at some time in the near future - */ -static void slow_work_schedule_cull(void) -{ - mod_timer(&slow_work_cull_timer, - round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); -} - -/* - * Worker thread culling algorithm - */ -static bool slow_work_cull_thread(void) -{ - unsigned long flags; - bool do_cull = false; - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (slow_work_cull) { - slow_work_cull = false; - - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) { - slow_work_schedule_cull(); - do_cull = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return do_cull; -} - -/* - * Determine if there is slow work available for dispatch - */ -static inline bool slow_work_available(int vsmax) -{ - return !list_empty(&slow_work_queue) || - (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax); -} - -/* - * Worker thread dispatcher - */ -static int slow_work_thread(void *_data) -{ - int vsmax, id; - - DEFINE_WAIT(wait); - - set_freezable(); - set_user_nice(current, -5); - - /* allocate ourselves an ID */ - spin_lock_irq(&slow_work_queue_lock); - id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); - __set_bit(id, slow_work_ids); - slow_work_set_thread_pid(id, current->pid); - spin_unlock_irq(&slow_work_queue_lock); - - sprintf(current->comm, "kslowd%03u", id); - - for (;;) { - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); - if (!freezing(current) && - !slow_work_threads_should_exit && - !slow_work_available(vsmax) && - !slow_work_cull) - schedule(); - finish_wait(&slow_work_thread_wq, &wait); - - try_to_freeze(); - - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - if (slow_work_available(vsmax) && slow_work_execute(id)) { - cond_resched(); - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) - slow_work_schedule_cull(); - continue; - } - - if (slow_work_threads_should_exit) - break; - - if (slow_work_cull && slow_work_cull_thread()) - break; - } - - spin_lock_irq(&slow_work_queue_lock); - slow_work_set_thread_pid(id, 0); - __clear_bit(id, slow_work_ids); - spin_unlock_irq(&slow_work_queue_lock); - - if (atomic_dec_and_test(&slow_work_thread_count)) - complete_and_exit(&slow_work_last_thread_exited, 0); - return 0; -} - -/* - * Handle thread cull timer expiration - */ -static void slow_work_cull_timeout(unsigned long data) -{ - slow_work_cull = true; - wake_up(&slow_work_thread_wq); -} - -/* - * Start a new slow work thread - */ -static void slow_work_new_thread_execute(struct slow_work *work) -{ - struct task_struct *p; - - if (slow_work_threads_should_exit) - return; - - if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) - return; - - if (!mutex_trylock(&slow_work_user_lock)) - return; - - slow_work_may_not_start_new_thread = true; - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) { - printk(KERN_DEBUG "Slow work thread pool: OOM\n"); - if (atomic_dec_and_test(&slow_work_thread_count)) - BUG(); /* we're running on a slow work thread... */ - mod_timer(&slow_work_oom_timer, - round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); - } else { - /* ratelimit the starting of new threads */ - mod_timer(&slow_work_oom_timer, jiffies + 1); - } - - mutex_unlock(&slow_work_user_lock); -} - -static const struct slow_work_ops slow_work_new_thread_ops = { - .owner = THIS_MODULE, - .execute = slow_work_new_thread_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = slow_work_new_thread_desc, -#endif -}; - -/* - * post-OOM new thread start suppression expiration - */ -static void slow_work_oom_timeout(unsigned long data) -{ - slow_work_may_not_start_new_thread = false; -} - -#ifdef CONFIG_SYSCTL -/* - * Handle adjustment of the minimum number of threads - */ -static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to start or stop threads */ - n = atomic_read(&slow_work_thread_count) - - slow_work_min_threads; - - if (n < 0 && !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - else if (n > 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} - -/* - * Handle adjustment of the maximum number of threads - */ -static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to stop threads */ - n = slow_work_max_threads - - atomic_read(&slow_work_thread_count); - - if (n < 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} -#endif /* CONFIG_SYSCTL */ - -/** - * slow_work_register_user - Register a user of the facility - * @module: The module about to make use of the facility - * - * Register a user of the facility, starting up the initial threads if there - * aren't any other users at this point. This will return 0 if successful, or - * an error if not. - */ -int slow_work_register_user(struct module *module) -{ - struct task_struct *p; - int loop; - - mutex_lock(&slow_work_user_lock); - - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); - init_completion(&slow_work_last_thread_exited); - - slow_work_threads_should_exit = false; - slow_work_init(&slow_work_new_thread, - &slow_work_new_thread_ops); - slow_work_may_not_start_new_thread = false; - slow_work_cull = false; - - /* start the minimum number of threads */ - for (loop = 0; loop < slow_work_min_threads; loop++) { - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) - goto error; - } - printk(KERN_NOTICE "Slow work thread pool: Ready\n"); - } - - slow_work_user_count++; - mutex_unlock(&slow_work_user_lock); - return 0; - -error: - if (atomic_dec_and_test(&slow_work_thread_count)) - complete(&slow_work_last_thread_exited); - if (loop > 0) { - printk(KERN_ERR "Slow work thread pool:" - " Aborting startup on ENOMEM\n"); - slow_work_threads_should_exit = true; - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_ERR "Slow work thread pool: Aborted\n"); - } - mutex_unlock(&slow_work_user_lock); - return PTR_ERR(p); -} -EXPORT_SYMBOL(slow_work_register_user); - -/* - * wait for all outstanding items from the calling module to complete - * - note that more items may be queued whilst we're waiting - */ -static void slow_work_wait_for_items(struct module *module) -{ -#ifdef CONFIG_MODULES - DECLARE_WAITQUEUE(myself, current); - struct slow_work *work; - int loop; - - mutex_lock(&slow_work_unreg_sync_lock); - add_wait_queue(&slow_work_unreg_wq, &myself); - - for (;;) { - spin_lock_irq(&slow_work_queue_lock); - - /* first of all, we wait for the last queued item in each list - * to be processed */ - list_for_each_entry_reverse(work, &vslow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - list_for_each_entry_reverse(work, &slow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - - /* then we wait for the items being processed to finish */ - slow_work_unreg_module = module; - smp_mb(); - for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { - if (slow_work_thread_processing[loop] == module) - goto do_wait; - } - spin_unlock_irq(&slow_work_queue_lock); - break; /* okay, we're done */ - - do_wait: - spin_unlock_irq(&slow_work_queue_lock); - schedule(); - slow_work_unreg_work_item = NULL; - slow_work_unreg_module = NULL; - } - - remove_wait_queue(&slow_work_unreg_wq, &myself); - mutex_unlock(&slow_work_unreg_sync_lock); -#endif /* CONFIG_MODULES */ -} - -/** - * slow_work_unregister_user - Unregister a user of the facility - * @module: The module whose items should be cleared - * - * Unregister a user of the facility, killing all the threads if this was the - * last one. - * - * This waits for all the work items belonging to the nominated module to go - * away before proceeding. - */ -void slow_work_unregister_user(struct module *module) -{ - /* first of all, wait for all outstanding items from the calling module - * to complete */ - if (module) - slow_work_wait_for_items(module); - - /* then we can actually go about shutting down the facility if need - * be */ - mutex_lock(&slow_work_user_lock); - - BUG_ON(slow_work_user_count <= 0); - - slow_work_user_count--; - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); - slow_work_threads_should_exit = true; - del_timer_sync(&slow_work_cull_timer); - del_timer_sync(&slow_work_oom_timer); - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_NOTICE "Slow work thread pool:" - " Shut down complete\n"); - } - - mutex_unlock(&slow_work_user_lock); -} -EXPORT_SYMBOL(slow_work_unregister_user); - -/* - * Initialise the slow work facility - */ -static int __init init_slow_work(void) -{ - unsigned nr_cpus = num_possible_cpus(); - - if (slow_work_max_threads < nr_cpus) - slow_work_max_threads = nr_cpus; -#ifdef CONFIG_SYSCTL - if (slow_work_max_max_threads < nr_cpus * 2) - slow_work_max_max_threads = nr_cpus * 2; -#endif -#ifdef CONFIG_SLOW_WORK_DEBUG - { - struct dentry *dbdir; - - dbdir = debugfs_create_dir("slow_work", NULL); - if (dbdir && !IS_ERR(dbdir)) - debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, - NULL, &slow_work_runqueue_fops); - } -#endif - return 0; -} - -subsys_initcall(init_slow_work); diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1ef41..00000000000 --- a/kernel/slow-work.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Slow work private definitions - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ - -#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ - -/* - * slow-work.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern struct slow_work *slow_work_execs[]; -extern pid_t slow_work_pids[]; -extern rwlock_t slow_work_execs_lock; -#endif - -extern struct list_head slow_work_queue; -extern struct list_head vslow_work_queue; -extern spinlock_t slow_work_queue_lock; - -/* - * slow-work-debugfs.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern const struct file_operations slow_work_runqueue_fops; - -extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); -#endif - -/* - * Helper functions - */ -static inline void slow_work_set_thread_pid(int id, pid_t pid) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_pids[id] = pid; -#endif -} - -static inline void slow_work_mark_time(struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - work->mark = CURRENT_TIME; -#endif -} - -static inline void slow_work_begin_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_execs[id] = work; -#endif -} - -static inline void slow_work_end_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - write_lock(&slow_work_execs_lock); - slow_work_execs[id] = NULL; - write_unlock(&slow_work_execs_lock); -#endif -} diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index 4b493f67dcb..00000000000 --- a/kernel/softlockup.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Detect Soft Lockups - * - * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. - * - * this code detects soft lockups: incidents in where on a CPU - * the kernel does not reschedule for 10 seconds or more. - */ -#include <linux/mm.h> -#include <linux/cpu.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/lockdep.h> -#include <linux/notifier.h> -#include <linux/module.h> -#include <linux/sysctl.h> - -#include <asm/irq_regs.h> - -static DEFINE_SPINLOCK(print_lock); - -static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ -static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); -static DEFINE_PER_CPU(bool, softlock_touch_sync); - -static int __read_mostly did_panic; -int __read_mostly softlockup_thresh = 60; - -/* - * Should we panic (and reboot, if panic_timeout= is set) when a - * soft-lockup occurs: - */ -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - -static int -softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = softlock_panic, -}; - -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(int this_cpu) -{ - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void __touch_softlockup_watchdog(void) -{ - int this_cpu = raw_smp_processor_id(); - - __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); -} - -void touch_softlockup_watchdog(void) -{ - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} -EXPORT_SYMBOL(touch_softlockup_watchdog); - -void touch_softlockup_watchdog_sync(void) -{ - __raw_get_cpu_var(softlock_touch_sync) = true; - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} - -void touch_all_softlockup_watchdogs(void) -{ - int cpu; - - /* Cause each CPU to re-update its timestamp rather than complain */ - for_each_online_cpu(cpu) - per_cpu(softlockup_touch_ts, cpu) = 0; -} -EXPORT_SYMBOL(touch_all_softlockup_watchdogs); - -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} - -/* - * This callback runs from the timer interrupt, and checks - * whether the watchdog thread has hung or not: - */ -void softlockup_tick(void) -{ - int this_cpu = smp_processor_id(); - unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); - unsigned long print_ts; - struct pt_regs *regs = get_irq_regs(); - unsigned long now; - - /* Is detection switched off? */ - if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { - /* Be sure we don't false trigger if switched back on */ - if (touch_ts) - per_cpu(softlockup_touch_ts, this_cpu) = 0; - return; - } - - if (touch_ts == 0) { - if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { - /* - * If the time stamp was touched atomically - * make sure the scheduler tick is up to date. - */ - per_cpu(softlock_touch_sync, this_cpu) = false; - sched_clock_tick(); - } - __touch_softlockup_watchdog(); - return; - } - - print_ts = per_cpu(softlockup_print_ts, this_cpu); - - /* report at most once a second */ - if (print_ts == touch_ts || did_panic) - return; - - /* do not print during early bootup: */ - if (unlikely(system_state != SYSTEM_RUNNING)) { - __touch_softlockup_watchdog(); - return; - } - - now = get_timestamp(this_cpu); - - /* - * Wake up the high-prio watchdog task twice per - * threshold timespan. - */ - if (time_after(now - softlockup_thresh/2, touch_ts)) - wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); - - /* Warn about unreasonable delays: */ - if (time_before_eq(now - softlockup_thresh, touch_ts)) - return; - - per_cpu(softlockup_print_ts, this_cpu) = touch_ts; - - spin_lock(&print_lock); - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_ts, - current->comm, task_pid_nr(current)); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - spin_unlock(&print_lock); - - if (softlockup_panic) - panic("softlockup: hung tasks"); -} - -/* - * The watchdog thread - runs every second and touches the timestamp. - */ -static int watchdog(void *__bind_cpu) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* initialize timestamp */ - __touch_softlockup_watchdog(); - - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in softlockup_tick(). - */ - while (!kthread_should_stop()) { - __touch_softlockup_watchdog(); - schedule(); - - if (kthread_should_stop()) - break; - - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); - p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); - if (IS_ERR(p)) { - printk(KERN_ERR "watchdog for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - per_cpu(softlockup_touch_ts, hotcpu) = 0; - per_cpu(softlockup_watchdog, hotcpu) = p; - kthread_bind(p, hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(softlockup_watchdog, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(softlockup_watchdog, hotcpu), - cpumask_any(cpu_online_mask)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: - p = per_cpu(softlockup_watchdog, hotcpu); - per_cpu(softlockup_watchdog, hotcpu) = NULL; - kthread_stop(p); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __initdata nosoftlockup; - -static int __init nosoftlockup_setup(char *str) -{ - nosoftlockup = 1; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); - -static int __init spawn_softlockup_task(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - if (nosoftlockup) - return 0; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - if (err == NOTIFY_BAD) { - BUG(); - return 1; - } - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - - return 0; -} -early_initcall(spawn_softlockup_task); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 70f8d90331e..4372ccb2512 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -35,9 +35,9 @@ struct cpu_stop_done { /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { spinlock_t lock; + bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ struct task_struct *thread; /* stopper thread */ - bool enabled; /* is this stopper enabled? */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 70f2ea758ff..bad369ec540 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -181,3 +181,7 @@ cond_syscall(sys_eventfd2); /* performance counters: */ cond_syscall(sys_perf_event_open); + +/* fanotify! */ +cond_syscall(sys_fanotify_init); +cond_syscall(sys_fanotify_mark); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d24f761f487..ca38e8e3e90 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -44,16 +44,17 @@ #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> +#include <linux/dnotify.h> #include <linux/syscalls.h> #include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/slow-work.h> #include <linux/perf_event.h> #include <linux/kprobes.h> #include <linux/pipe_fs_i.h> +#include <linux/oom.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -76,15 +77,16 @@ #include <scsi/sg.h> #endif +#ifdef CONFIG_LOCKUP_DETECTOR +#include <linux/nmi.h> +#endif + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -extern int sysctl_panic_on_oom; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_oom_dump_tasks; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -106,7 +108,7 @@ extern int blk_iopoll_enabled; #endif /* Constants used for minimum and maximum */ -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; static int neg_one = -1; #endif @@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; +#ifdef CONFIG_INOTIFY_USER +#include <linux/inotify.h> +#endif #ifdef CONFIG_SPARC #include <asm/system.h> #endif @@ -206,9 +211,6 @@ static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; extern struct ctl_table random_table[]; -#ifdef CONFIG_INOTIFY_USER -extern struct ctl_table inotify_table[]; -#endif #ifdef CONFIG_EPOLL extern struct ctl_table epoll_table[]; #endif @@ -562,7 +564,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#ifdef CONFIG_HOTPLUG { .procname = "hotplug", .data = &uevent_helper, @@ -710,7 +712,34 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog_enabled, + }, + { + .procname = "watchdog_thresh", + .data = &softlockup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog_thresh, + .extra1 = &neg_one, + .extra2 = &sixty, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, @@ -813,26 +842,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dosoftlockup_thresh, - .extra1 = &neg_one, - .extra2 = &sixty, - }, -#endif #ifdef CONFIG_DETECT_HUNG_TASK { .procname = "hung_task_panic", @@ -906,13 +915,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SLOW_WORK - { - .procname = "slow-work", - .mode = 0555, - .child = slow_work_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS { .procname = "perf_event_paranoid", diff --git a/kernel/time.c b/kernel/time.c index 848b1c2ab09..ba9b338d183 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifndef CONFIG_GENERIC_TIME -/* - * Simulate gettimeofday using do_gettimeofday which only allows a timeval - * and therefore only yields usec accuracy - */ -void getnstimeofday(struct timespec *tv) -{ - struct timeval x; - - do_gettimeofday(&x); - tv->tv_sec = x.tv_sec; - tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; -} -EXPORT_SYMBOL_GPL(getnstimeofday); -#endif - /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed42951e0..f06a8a36564 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -6,7 +6,7 @@ config TICK_ONESHOT config NO_HZ bool "Tickless System (Dynamic Ticks)" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables a tickless system: timer interrupts will @@ -15,7 +15,7 @@ config NO_HZ config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. If your diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f08e99c1d56..c18d7efa1b4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) return max_nsecs - (max_nsecs >> 5); } -#ifdef CONFIG_GENERIC_TIME +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET /** * clocksource_select - Select the best clocksource available @@ -577,7 +577,7 @@ static void clocksource_select(void) } } -#else /* CONFIG_GENERIC_TIME */ +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) #define MAX_UPDATE_LENGTH 5 /* Seconds */ /** - * __clocksource_register_scale - Used to install new clocksources + * __clocksource_updatefreq_scale - Used update clocksource with new freq * @t: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * - * Returns -EBUSY if registration fails, zero otherwise. + * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_register_hz() or clocksource_register_khz helper functions. + * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. */ -int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* * Ideally we want to use some of the limits used in * clocksource_max_deferment, to provide a more informed @@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) NSEC_PER_SEC/scale, MAX_UPDATE_LENGTH*scale); cs->max_idle_ns = clocksource_max_deferment(cs); +} +EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Intialize mult/shift and max_idle_ns */ + __clocksource_updatefreq_scale(cs, scale, freq); + /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5fc66..48b2761b566 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() + * when the event already expired. clockevents_program_event() * sets dev->next_event only when the event is really * programmed to the device. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 813993b5fb6..3e216e01bbd 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { + arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { @@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -780,7 +774,6 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); - u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -790,10 +783,6 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); - hrtimer_add_expires_ns(&ts->sched_timer, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d4f5c..e14c839e9fa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static struct timespec xtime __attribute__ ((aligned (16))); +static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; /* @@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } -#ifdef CONFIG_GENERIC_TIME - /** * timekeeping_forward_now - update clock to the current time * @@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv) timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) tick_clock_notify(); } -#else /* GENERIC_TIME */ - -static inline void timekeeping_forward_now(void) { } - -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - -#endif /* !GENERIC_TIME */ - /** * ktime_get_real - get the real (wall-) time in ktime_t format * @@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add_safe(xtime, ts); + xtime = timespec_add(xtime, ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add_safe(total_sleep_time, ts); + total_sleep_time = timespec_add(total_sleep_time, ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -784,10 +738,11 @@ void update_wall_time(void) return; clock = timekeeper.clock; -#ifdef CONFIG_GENERIC_TIME - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; -#else + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = timekeeper.cycle_interval; +#else + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; @@ -856,7 +811,8 @@ void update_wall_time(void) } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } /** @@ -887,7 +843,7 @@ EXPORT_SYMBOL_GPL(getboottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add_safe(*ts, total_sleep_time); + *ts = timespec_add(*ts, total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); @@ -902,6 +858,11 @@ struct timespec __current_kernel_time(void) return xtime; } +struct timespec __get_wall_to_monotonic(void) +{ + return wall_to_monotonic; +} + struct timespec current_kernel_time(void) { struct timespec now; diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8d4e1..f1b8afe1ad8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB for - * the new flag to indicate whether the timer is deferrable + * base in timer_list is guaranteed to be zero. Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. */ #define TBASE_DEFERRABLE_FLAG (0x1) @@ -577,6 +582,19 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + /** * init_timer_key - initialize a timer * @timer: the timer to be initialized @@ -679,12 +697,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); @@ -1289,7 +1303,6 @@ void run_local_timers(void) { hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); } /* @@ -1750,3 +1763,25 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c4545..538501c6ea5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -153,7 +153,7 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE @@ -175,7 +175,7 @@ config IRQSOFF_TRACER config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE @@ -194,15 +194,6 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. - config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -229,23 +220,6 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. -config BOOT_TRACER - bool "Trace boot initcalls" - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer helps developers to optimize boot times: it records - the timings of the initcalls and traces key events and the identity - of tasks that can cause boot delays, such as context-switches. - - Its aim is to be parsed by the scripts/bootgraph.pl tool to - produce pretty graphics about boot inefficiencies, giving a visual - representation of the delays during initcalls - but the raw - /debug/tracing/trace text output is readable too. - - You must pass in initcall_debug and ftrace=initcall to the kernel - command line to enable this on bootup. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -325,28 +299,6 @@ config BRANCH_TRACER Say N if unsure. -config KSYM_TRACER - bool "Trace read and write access on kernel memory locations" - depends on HAVE_HW_BREAKPOINT - select TRACING - help - This tracer helps find read and write operations on any given kernel - symbol i.e. /proc/kallsyms. - -config PROFILE_KSYM_TRACER - bool "Profile all kernel memory accesses on 'watched' variables" - depends on KSYM_TRACER - help - This tracer profiles kernel accesses on variables watched through the - ksym tracer ftrace plugin. Depending upon the hardware, all read - and write operations on kernel variables can be monitored for - accesses. - - The results will be displayed in: - /debugfs/tracing/profile_ksym - - Say N if unsure. - config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER @@ -371,37 +323,6 @@ config STACK_TRACER Say N if unsure. -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - -config WORKQUEUE_TRACER - bool "Trace workqueues" - select GENERIC_TRACER - help - The workqueue tracer provides some statistical information - about each cpu workqueue thread such as the number of the - works inserted and executed since their creation. It can help - to evaluate the amount of work each of them has to perform. - For example it can help a developer to decide whether he should - choose a per-cpu workqueue instead of a singlethreaded one. - config BLK_DEV_IO_TRACE bool "Support for tracing block IO actions" depends on SYSFS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ffb1a5b0550..53f338190b2 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o -obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o @@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o -obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) @@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14f944..0d88ce9b9fb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1883,7 +1883,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) struct hlist_head *hhd; struct hlist_node *n; unsigned long key; - int resched; key = hash_long(ip, FTRACE_HASH_BITS); @@ -1897,12 +1896,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) * period. This syncs the hash iteration and freeing of items * on the hash. rcu_read_lock is too dangerous here. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); hlist_for_each_entry_rcu(entry, n, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_probe_ops __read_mostly = diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb1660..00000000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - */ - -#include <linux/tracepoint.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/dcache.h> -#include <linux/fs.h> - -#include <linux/kmemtrace.h> - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void -kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(void *ignore, - unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree, NULL); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - unregister_trace_kfree(kmemtrace_kfree, NULL); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - kmemtrace_array = tr; - - tracing_reset_online_cpus(tr); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. - */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, - (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, - (unsigned long)entry->gfp_flags, entry->node); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", - entry->type_id, (void *)entry->call_site, - (unsigned long)entry->ptr); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - struct kmemtrace_user_event *ev; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter) -{ - struct kmemtrace_alloc_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node and call site*/ - ret = trace_seq_printf(s, "%4d %pf\n", entry->node, - (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter) -{ - struct kmemtrace_free_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node and print call site*/ - ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return TRACE_TYPE_UNHANDLED; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: - return kmemtrace_print_alloc_compress(iter); - case TRACE_KMEM_FREE: - return kmemtrace_print_free_compress(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct trace_event_functions kmem_trace_alloc_funcs = { - .trace = kmemtrace_print_alloc, - .binary = kmemtrace_print_alloc_user, -}; - -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, - .funcs = &kmem_trace_alloc_funcs, -}; - -static struct trace_event_functions kmem_trace_free_funcs = { - .trace = kmemtrace_print_free, - .binary = kmemtrace_print_free_user, -}; - -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, - .funcs = &kmem_trace_free_funcs, -}; - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - if (!register_ftrace_event(&kmem_trace_alloc)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (!register_ftrace_event(&kmem_trace_free)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (register_tracer(&kmem_tracer) != 0) { - pr_warning("Warning: could not register the kmem tracer\n"); - return 1; - } - - return 0; -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1da7b6ea8b8..3632ce87674 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) */ struct ring_buffer_per_cpu { int cpu; + atomic_t record_disabled; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; @@ -462,7 +463,6 @@ struct ring_buffer_per_cpu { unsigned long read; u64 write_stamp; u64 read_stamp; - atomic_t record_disabled; }; struct ring_buffer { @@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void) #endif -static DEFINE_PER_CPU(int, rb_need_resched); - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out_nocheck; @@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (!event) goto out; - /* - * Need to store resched state on this cpu. - * Only the first needs to. - */ - - if (preempt_count() == 1) - per_cpu(rb_need_resched, cpu) = resched; - return event; out: trace_recursive_unlock(); out_nocheck: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); @@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return 0; } @@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); @@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer, struct ring_buffer_event *event; void *body; int ret = -EBUSY; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out; @@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 086d3631680..ba14a22be4c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_var_t __read_mostly tracing_buffer_mask; - -#define for_each_tracing_cpu(cpu) \ - for_each_cpu(cpu, tracing_buffer_mask) +cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -428,6 +425,7 @@ static const char *trace_options[] = { "latency-format", "sleep-time", "graph-time", + "record-cmd", NULL }; @@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); @@ -729,18 +736,11 @@ __acquires(kernel_lock) return -1; } - if (strlen(type->name) > MAX_TRACER_SIZE) { + if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } - /* - * When this gets called we hold the BKL which means that - * preemption is disabled. Various trace selftests however - * need to disable and enable preemption for successful tests. - * So we drop the BKL here and grab it after the tests again. - */ - unlock_kernel(); mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -822,7 +822,6 @@ __acquires(kernel_lock) #endif out_unlock: - lock_kernel(); return ret; } @@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ -static void -ftrace_trace_special(void *__tr, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - int pc) -{ - struct ftrace_event_call *call = &event_special; - struct ring_buffer_event *event; - struct trace_array *tr = __tr; - struct ring_buffer *buffer = tr->buffer; - struct special_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, - sizeof(*entry), 0, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->arg1 = arg1; - entry->arg2 = arg2; - entry->arg3 = arg3; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (tracing_disabled) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, arg1, arg2, arg3, pc); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - /** * trace_vbprintk - write binary msg to tracing buffer * @@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct bprint_entry *entry; unsigned long flags; int disable; - int resched; int cpu, len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1452,7 +1395,7 @@ out_unlock: out: atomic_dec_return(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; - static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ @@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, } /* Find the next real entry, and increment the iterator to the next entry */ -static void *find_next_entry_inc(struct trace_iterator *iter) +void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); @@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return NULL; if (iter->idx < 0) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } -static void tracing_iter_reset(struct trace_iterator *iter, int cpu) +void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct trace_array *tr = iter->tr; struct ring_buffer_event *event; @@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter) } /* Called with trace_event_read_lock() held. */ -static enum print_line_t print_trace_line(struct trace_iterator *iter) +enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .release = seq_release, + .llseek = seq_lseek, }; /* @@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) @@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); } static ssize_t @@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, static const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size) if (ret < 0) return ret; + if (!current_trace->use_max_tr) + goto out; + ret = ring_buffer_resize(max_tr.buffer, size); if (ret < 0) { int r; @@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } + max_tr.entries = size; + out: global_trace.entries = size; return ret; } + /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - + if (current_trace && current_trace->use_max_tr) { + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(max_tr.buffer, 1); + max_tr.entries = 1; + } destroy_trace_option_files(topts); current_trace = t; topts = create_trace_option_files(current_trace); + if (current_trace->use_max_tr) { + ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); + if (ret < 0) + goto out; + max_tr.entries = global_trace.entries; + } if (t->init) { ret = tracer_init(t, tr); @@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (iter->trace->pipe_open) iter->trace->pipe_open(iter); + nonseekable_open(inode, filp); out: mutex_unlock(&trace_types_lock); return ret; @@ -3211,7 +3177,7 @@ waitagain: trace_event_read_lock(); trace_access_lock(iter->cpu_file); - while (find_next_entry_inc(iter) != NULL) { + while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; - if (!find_next_entry_inc(iter)) { + if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; @@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (ret <= 0) goto out_err; - if (!iter->ent && !find_next_entry_inc(iter)) { + if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } @@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } tracing_start(); - max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); return cnt; @@ -3590,18 +3555,21 @@ static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, + .llseek = generic_file_llseek, }; static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_pipe_fops = { @@ -3610,17 +3578,20 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, + .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, + .llseek = generic_file_llseek, }; static const struct file_operations trace_clock_fops = { @@ -3926,6 +3897,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic, .read = tracing_stats_read, + .llseek = generic_file_llseek, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -3962,6 +3934,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, + .llseek = generic_file_llseek, }; #endif @@ -4115,6 +4088,7 @@ static const struct file_operations trace_options_fops = { .open = tracing_open_generic, .read = trace_options_read, .write = trace_options_write, + .llseek = generic_file_llseek, }; static ssize_t @@ -4166,6 +4140,7 @@ static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, + .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, @@ -4355,9 +4330,6 @@ static __init int tracer_init_debugfs(void) trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_SYSPROF_TRACER - init_tracer_sysprof_debugfs(d_tracer); -#endif create_trace_options_dir(); @@ -4414,7 +4386,7 @@ static struct notifier_block trace_die_notifier = { */ #define KERN_TRACE KERN_EMERG -static void +void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ @@ -4429,6 +4401,13 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = current_trace; + iter->cpu_file = TRACE_PIPE_ALL_CPU; +} + static void __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { @@ -4454,8 +4433,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + trace_init_global_iter(&iter); + for_each_tracing_cpu(cpu) { - atomic_inc(&global_trace.data[cpu]->disabled); + atomic_inc(&iter.tr->data[cpu]->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4504,7 +4485,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (find_next_entry_inc(&iter) != NULL) { + if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); @@ -4526,7 +4507,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&global_trace.data[cpu]->disabled); + atomic_dec(&iter.tr->data[cpu]->disabled); } tracing_on(); } @@ -4575,16 +4556,14 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = ring_buffer_size(max_tr.buffer); - WARN_ON(max_tr.entries != global_trace.entries); + max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ @@ -4597,9 +4576,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); current_trace = &nop_trace; -#ifdef CONFIG_BOOT_TRACER - register_tracer(&boot_tracer); -#endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd96399463..d39b3c5454a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,10 +9,7 @@ #include <linux/mmiotrace.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> -#include <trace/boot.h> -#include <linux/kmemtrace.h> #include <linux/hw_breakpoint.h> - #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -25,30 +22,17 @@ enum trace_type { TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, - TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, - TRACE_BOOT_CALL, - TRACE_BOOT_RET, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, TRACE_BLK, - TRACE_KSYM, __TRACE_LAST_TYPE, }; -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -extern struct tracer boot_tracer; #undef __field #define __field(type, item) type item; @@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ - IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ - IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -298,6 +274,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + int use_max_tr; }; @@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name, const struct file_operations *fops); struct dentry *tracing_init_dentry(void); -void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; @@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); @@ -355,11 +339,6 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); -void trace_special(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -380,8 +359,15 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, +}; + +extern cpumask_var_t __read_mostly tracing_buffer_mask; -extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -452,12 +438,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sysprof(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_ksym(struct tracer *trace, - struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -471,6 +453,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; @@ -617,6 +601,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x20000, TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, }; /* @@ -628,54 +613,6 @@ enum trace_iterator_flags { extern struct tracer nop_trace; -/** - * ftrace_preempt_disable - disable preemption scheduler safe - * - * When tracing can happen inside the scheduler, there exists - * cases that the tracing might happen before the need_resched - * flag is checked. If this happens and the tracer calls - * preempt_enable (after a disable), a schedule might take place - * causing an infinite recursion. - * - * To prevent this, we read the need_resched flag before - * disabling preemption. When we want to enable preemption we - * check the flag, if it is set, then we call preempt_enable_no_resched. - * Otherwise, we call preempt_enable. - * - * The rational for doing the above is that if need_resched is set - * and we have yet to reschedule, we are either in an atomic location - * (where we do not need to check for scheduling) or we are inside - * the scheduler and do not want to resched. - */ -static inline int ftrace_preempt_disable(void) -{ - int resched; - - resched = need_resched(); - preempt_disable_notrace(); - - return resched; -} - -/** - * ftrace_preempt_enable - enable preemption scheduler safe - * @resched: the return value from ftrace_preempt_disable - * - * This is a scheduler safe way to enable preemption and not miss - * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we are either inside an atomic or - * are inside the scheduler (we would have already scheduled - * otherwise). In this case, we do not want to call normal - * preempt_enable, but preempt_enable_no_resched instead. - */ -static inline void ftrace_preempt_enable(int resched) -{ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); @@ -766,6 +703,8 @@ struct filter_pred { int pop_n; }; +extern struct list_head ftrace_common_fields; + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, @@ -795,6 +734,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +extern void trace_event_enable_cmd_record(bool enable); + extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3956a..00000000000 --- a/kernel/trace/trace_boot.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * ring buffer based initcalls tracer - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/kallsyms.h> -#include <linux/time.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *boot_trace; -static bool pre_initcalls_finished; - -/* Tells the boot tracer that the pre_smp_initcalls are finished. - * So we are ready . - * It doesn't enable sched events tracing however. - * You have to call enable_boot_trace to do so. - */ -void start_boot_trace(void) -{ - pre_initcalls_finished = true; -} - -void enable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_start_sched_switch_record(); -} - -void disable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_stop_sched_switch_record(); -} - -static int boot_trace_init(struct trace_array *tr) -{ - boot_trace = tr; - - if (!tr) - return 0; - - tracing_reset_online_cpus(tr); - - tracing_sched_switch_assign_trace(tr); - return 0; -} - -static enum print_line_t -initcall_call_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_call *field; - struct boot_trace_call *call; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - call = &field->boot_call; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - (unsigned long)ts, nsec_rem, call->func, call->caller); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -initcall_ret_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_ret *field; - struct boot_trace_ret *init_ret; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - init_ret = &field->boot_ret; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %llu msecs\n", - (unsigned long) ts, - nsec_rem, - init_ret->func, init_ret->result, init_ret->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t initcall_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_BOOT_CALL: - return initcall_call_print_line(iter); - case TRACE_BOOT_RET: - return initcall_ret_print_line(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -struct tracer boot_tracer __read_mostly = -{ - .name = "initcall", - .init = boot_trace_init, - .reset = tracing_reset_online_cpus, - .print_line = initcall_print_line, -}; - -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_call; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_call *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - /* Get its name now since this function could - * disappear because it is in the .init section. - */ - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_call = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_ret; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_ret *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_ret = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1..685a67d55db 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -32,16 +32,15 @@ u64 notrace trace_clock_local(void) { u64 clock; - int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); clock = sched_clock(); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return clock; } @@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1240d..e3dfecaf13e 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, ); /* - * Special (free-form) trace entry: - */ -FTRACE_ENTRY(special, special_entry, - - TRACE_SPECIAL, - - F_STRUCT( - __field( unsigned long, arg1 ) - __field( unsigned long, arg2 ) - __field( unsigned long, arg3 ) - ), - - F_printk("(%08lx) (%08lx) (%08lx)", - __entry->arg1, __entry->arg2, __entry->arg3) -); - -/* * Stack-trace entry: */ @@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __entry->map_id, __entry->opcode) ); -FTRACE_ENTRY(boot_call, trace_boot_call, - - TRACE_BOOT_CALL, - - F_STRUCT( - __field_struct( struct boot_trace_call, boot_call ) - __field_desc( pid_t, boot_call, caller ) - __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) - ), - - F_printk("%d %s", __entry->caller, __entry->func) -); - -FTRACE_ENTRY(boot_ret, trace_boot_ret, - - TRACE_BOOT_RET, - - F_STRUCT( - __field_struct( struct boot_trace_ret, boot_ret ) - __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) - __field_desc( int, boot_ret, result ) - __field_desc( unsigned long, boot_ret, duration ) - ), - - F_printk("%s %d %lx", - __entry->func, __entry->result, __entry->duration) -); #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 @@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, - - TRACE_KMEM_ALLOC, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" - " flags:%x node:%d", - __entry->type_id, __entry->call_site, __entry->ptr, - __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, __entry->node) -); - -FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, - - TRACE_KMEM_FREE, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - F_printk("type:%u call_site:%lx ptr:%p", - __entry->type_id, __entry->call_site, __entry->ptr) -); - -FTRACE_ENTRY(ksym_trace, ksym_trace_entry, - - TRACE_KSYM, - - F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned char, type ) - __array( char , cmd, TASK_COMM_LEN ) - __field( unsigned long, addr ) - ), - - F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", - (void *)__entry->ip, (unsigned int)__entry->type, - (void *)__entry->addr, __entry->cmd) -); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a2b73f7c06..000e6e85b44 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,8 +9,6 @@ #include <linux/kprobes.h> #include "trace.h" -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); - static char *perf_trace_buf[4]; /* @@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - if (tp_event->class->reg) - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); - else - ret = tracepoint_probe_register(tp_event->name, - tp_event->class->perf_probe, - tp_event); - + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); if (ret) goto fail; @@ -96,9 +88,7 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && - (tp_event->class->perf_probe || - tp_event->class->reg) && + tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); break; @@ -138,18 +128,13 @@ void perf_trace_destroy(struct perf_event *p_event) if (--tp_event->perf_refcount > 0) goto out; - if (tp_event->class->reg) - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - else - tracepoint_probe_unregister(tp_event->name, - tp_event->class->perf_probe, - tp_event); + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); /* - * Ensure our callback won't be called anymore. See - * tracepoint_probe_unregister() and __DO_TRACE(). + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. */ - synchronize_sched(); + tracepoint_synchronize_unregister(); free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0b080..09b4fa6e4d3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -28,6 +28,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +LIST_HEAD(ftrace_common_fields); struct list_head * trace_get_fields(struct ftrace_event_call *event_call) @@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } -int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed, - int filter_type) +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) { struct ftrace_event_field *field; - struct list_head *head; - - if (WARN_ON(!call->class)) - return 0; field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) @@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, field->size = size; field->is_signed = is_signed; - head = trace_get_fields(call); list_add(&field->link, head); return 0; @@ -80,17 +76,32 @@ err: return -ENOMEM; } + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} EXPORT_SYMBOL_GPL(trace_define_field); #define __common_field(type, item) \ - ret = trace_define_field(call, #type, "common_" #item, \ - offsetof(typeof(ent), item), \ - sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -static int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(void) { int ret; struct trace_entry ent; @@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->flags & TRACE_EVENT_FL_ENABLED) { call->flags &= ~TRACE_EVENT_FL_ENABLED; - tracing_stop_cmdline_record(); - if (call->class->reg) - call->class->reg(call, TRACE_REG_UNREGISTER); - else - tracepoint_probe_unregister(call->name, - call->class->probe, - call); + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { - tracing_start_cmdline_record(); - if (call->class->reg) - ret = call->class->reg(call, TRACE_REG_REGISTER); - else - ret = tracepoint_probe_register(call->name, - call->class->probe, - call); + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } + ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " @@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (match && @@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && (call->class->probe || call->class->reg)) + if (call->class && call->class->reg) return call; } @@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system) != 0) @@ -544,32 +598,10 @@ out: return ret; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +static void print_event_fields(struct trace_seq *s, struct list_head *head) { - struct ftrace_event_call *call = filp->private_data; struct ftrace_event_field *field; - struct list_head *head; - struct trace_seq *s; - int common_field_count = 5; - char *buf; - int r = 0; - - if (*ppos) - return 0; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - trace_seq_init(s); - - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->event.type); - trace_seq_printf(s, "format:\n"); - head = trace_get_fields(call); list_for_each_entry_reverse(field, head, link) { /* * Smartly shows the array type(except dynamic array). @@ -584,29 +616,54 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, array_descriptor = NULL; if (!array_descriptor) { - r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" + trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" "\tsize:%u;\tsigned:%d;\n", field->type, field->name, field->offset, field->size, !!field->is_signed); } else { - r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" + trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" "\tsize:%u;\tsigned:%d;\n", (int)(array_descriptor - field->type), field->type, field->name, array_descriptor, field->offset, field->size, !!field->is_signed); } + } +} - if (--common_field_count == 0) - r = trace_seq_printf(s, "\n"); +static ssize_t +event_format_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct list_head *head; + struct trace_seq *s; + char *buf; + int r; - if (!r) - break; - } + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + trace_seq_printf(s, "name: %s\n", call->name); + trace_seq_printf(s, "ID: %d\n", call->event.type); + trace_seq_printf(s, "format:\n"); + + /* print common fields */ + print_event_fields(s, &ftrace_common_fields); - if (r) - r = trace_seq_printf(s, "\nprint fmt: %s\n", - call->print_fmt); + trace_seq_putc(s, '\n'); + + /* print event specific fields */ + head = trace_get_fields(call); + print_event_fields(s, head); + + r = trace_seq_printf(s, "\nprint fmt: %s\n", call->print_fmt); if (!r) { /* @@ -963,35 +1020,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->class->probe || call->class->reg) + if (call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); #ifdef CONFIG_PERF_EVENTS - if (call->event.type && (call->class->perf_probe || call->class->reg)) + if (call->event.type && call->class->reg) trace_create_file("id", 0444, call->dir, call, id); #endif - if (call->class->define_fields) { - /* - * Other events may have the same class. Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - ret = trace_define_common_fields(call); - if (!ret) - ret = call->class->define_fields(call); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; } - trace_create_file("filter", 0644, call->dir, call, - filter); } + trace_create_file("filter", 0644, call->dir, call, + filter); trace_create_file("format", 0444, call->dir, call, format); @@ -999,11 +1052,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } -static int __trace_add_event_call(struct ftrace_event_call *call) +static int +__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *d_events; int ret; + /* The linker may leave blanks */ if (!call->name) return -EINVAL; @@ -1011,8 +1070,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "events/%s\n", call->name); + pr_warning("Could not initialize trace events/%s\n", + call->name); return ret; } } @@ -1021,11 +1080,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) if (!d_events) return -ENOENT; - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, id, enable, filter, format); if (!ret) list_add(&call->list, &ftrace_events); + call->mod = mod; return ret; } @@ -1035,7 +1093,10 @@ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call); + ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); mutex_unlock(&event_mutex); return ret; } @@ -1152,8 +1213,6 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; - struct dentry *d_events; - int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1161,38 +1220,14 @@ static void trace_module_add_events(struct module *mod) if (start == end) return; - d_events = event_trace_events_dir(); - if (!d_events) + file_ops = trace_create_file_ops(mod); + if (!file_ops) return; for_each_event(call, start, end) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - /* - * This module has events, create file ops for this module - * if not already done. - */ - if (!file_ops) { - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - } - call->mod = mod; - ret = event_create_dir(call, d_events, + __trace_add_event_call(call, mod, &file_ops->id, &file_ops->enable, &file_ops->filter, &file_ops->format); - if (!ret) - list_add(&call->list, &ftrace_events); } } @@ -1319,25 +1354,14 @@ static __init int event_trace_init(void) trace_create_file("enable", 0644, d_events, NULL, &ftrace_system_enable_fops); + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + __trace_add_event_call(call, NULL, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - if (!ret) - list_add(&call->list, &ftrace_events); } while (true) { @@ -1524,12 +1548,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) struct ftrace_entry *entry; unsigned long flags; long disabled; - int resched; int cpu; int pc; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1551,7 +1574,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 57bb1bb3299..36d40104b17 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system, } static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) +__find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; - struct list_head *head; - head = trace_get_fields(call); list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; @@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static void filter_free_pred(struct filter_pred *pred) { if (!pred) @@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system) int err; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter = call->filter; - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 8536e2a6596..4ba44deaac2 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" -static int ftrace_raw_init_event(struct ftrace_event_call *call) -{ - INIT_LIST_HEAD(&call->class->fields); - return 0; -} - #undef __entry #define __entry REC @@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ - .raw_init = ftrace_raw_init_event, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ }; \ \ struct ftrace_event_call __used \ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776b0cd..16aee4d44e8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu, resched; + int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac99a9..6bff2362578 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -641,7 +641,8 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", + nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6fd486e0cef..73a6b0601f2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -649,6 +649,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -681,6 +682,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -715,6 +717,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 00000000000..7b8ecd751d9 --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,136 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "../debug/kdb/kdb_private.h" +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + if (!trace_empty(&iter)) + trace_find_next_entry_inc(&iter); + while (!trace_empty(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + print_trace_line(&iter); + if (!skip_lines) + trace_printk_seq(&iter.seq); + else + skip_lines--; + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + for_each_tracing_cpu(cpu) + if (iter.buffer_iter[cpu]) + ring_buffer_read_finish(iter.buffer_iter[cpu]); +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = TRACE_PIPE_ALL_CPU; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_REPEAT_NONE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f50299..8b27c9849b4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,6 +30,8 @@ #include <linux/ptrace.h> #include <linux/perf_event.h> #include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -38,6 +40,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX #define KPROBE_EVENT_SYSTEM "kprobes" /* Reserved field names */ @@ -58,14 +61,16 @@ const char *reserved_field_names[] = { }; /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, + void *); #define PRINT_TYPE_FUNC_NAME(type) print_type_##type #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, void *data)\ + const char *name, \ + void *data, void *ent)\ { \ return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ } \ @@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +/* Print type function for string type */ +static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, + const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); +} +static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); @@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm, return fprm->fn(regs, fprm->data, dest); } -#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. */ -#define DEFINE_BASIC_FETCH_FUNCS(kind) \ -DEFINE_FETCH_##kind(u8) \ -DEFINE_FETCH_##kind(u16) \ -DEFINE_FETCH_##kind(u32) \ -DEFINE_FETCH_##kind(u64) - -#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ - ((FETCH_FUNC_NAME(kind, u8) == fn) || \ - (FETCH_FUNC_NAME(kind, u16) == fn) || \ - (FETCH_FUNC_NAME(kind, u32) == fn) || \ - (FETCH_FUNC_NAME(kind, u64) == fn)) +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ + void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL #define DEFINE_FETCH_stack(type) \ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ @@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL #define DEFINE_FETCH_retval(type) \ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ @@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ *(type *)dest = (type)regs_return_value(regs); \ } DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL #define DEFINE_FETCH_memory(type) \ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ @@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = retval; \ } DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + if (!maxlen) + return; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); +} +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int ret, len = 0; + u8 c; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} /* Memory fetching by symbol */ struct symbol_cache { @@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) /* Dereference memory access function */ struct deref_fetch_param { @@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) +DEFINE_FETCH_deref(string_size) static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) free_deref_fetch_param(data->orig.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } @@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(kind, type) \ - .kind = FETCH_FUNC_NAME(kind, type) - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - {.name = #ptype, \ - .size = sizeof(ftype), \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_END, +}; + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } \ } +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + /* Fetch type information table */ static const struct fetch_type { const char *name; /* Name of type */ @@ -264,14 +404,16 @@ static const struct fetch_type { int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ /* Fetch functions */ - fetch_func_t reg; - fetch_func_t stack; - fetch_func_t retval; - fetch_func_t memory; - fetch_func_t symbol; - fetch_func_t deref; + fetch_func_t fetch[FETCH_MTD_END]; } fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), ASSIGN_FETCH_TYPE(u16, u16, 0), ASSIGN_FETCH_TYPE(u32, u32, 0), @@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs, *(unsigned long *)dest = kernel_stack_pointer(regs); } +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn) +{ + int i; + + if (type != &fetch_type_table[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + return NULL; +} + /** * Kprobe event core functions */ struct probe_arg { struct fetch_param fetch; + struct fetch_param fetch_size; unsigned int offset; /* Offset from argument entry */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ @@ -429,9 +587,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); kfree(arg->name); kfree(arg->comm); @@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (strcmp(arg, "retval") == 0) { if (is_return) - f->fn = t->retval; + f->fn = t->fetch[FETCH_MTD_retval]; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { @@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - f->fn = t->stack; + f->fn = t->fetch[FETCH_MTD_stack]; f->data = (void *)param; } } else @@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->reg; + f->fn = t->fetch[FETCH_MTD_reg]; f->data = (void *)(unsigned long)ret; ret = 0; } @@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - f->fn = t->memory; + f->fn = t->fetch[FETCH_MTD_memory]; f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); @@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, break; f->data = alloc_symbol_cache(arg + 1, offset); if (f->data) - f->fn = t->symbol; + f->fn = t->fetch[FETCH_MTD_symbol]; } break; case '+': /* deref memory */ @@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, if (ret) kfree(dprm); else { - f->fn = t->deref; + f->fn = t->fetch[FETCH_MTD_deref]; f->data = (void *)dprm; } } break; } - if (!ret && !f->fn) + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", + t->name); ret = -EINVAL; + } return ret; } @@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) { const char *t; + int ret; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, } parg->offset = tp->size; tp->size += parg->type->size; - return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn); + parg->fetch_size.data = parg->fetch.data; + } + return ret; } /* Return 1 if name is reserved or already used by another argument */ @@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv) pr_info("Delete command needs an event name.\n"); return -EINVAL; } + mutex_lock(&probe_lock); tp = find_probe_event(event, group); if (!tp) { + mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ unregister_trace_probe(tp); free_trace_probe(tp); + mutex_unlock(&probe_lock); return 0; } @@ -1043,6 +1214,54 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Sum up total data length for dynamic arraies (strings) */ +static __kprobes int __get_data_size(struct trace_probe *tp, + struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, + struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + /* Kprobe handler */ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { @@ -1050,8 +1269,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, dsize, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -1060,7 +1278,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1069,9 +1288,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) entry = ring_buffer_event_data(event); entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1085,15 +1302,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, pc, dsize; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1103,9 +1320,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1137,7 +1352,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1179,7 +1394,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1214,11 +1429,6 @@ static void probe_event_disable(struct ftrace_event_call *call) } } -static int probe_event_raw_init(struct ftrace_event_call *event_call) -{ - return 0; -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1239,7 +1449,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1261,7 +1471,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1301,8 +1511,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); } #undef LEN_OR_ZERO @@ -1339,11 +1554,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1355,9 +1570,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, return; entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); @@ -1371,11 +1585,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1388,9 +1602,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); @@ -1486,15 +1698,12 @@ static int register_probe_event(struct trace_probe *tp) int ret; /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); if (probe_is_return(tp)) { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kretprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kretprobe_event_define_fields; } else { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 8eaf00749b6..00000000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null @@ -1,508 +0,0 @@ -/* - * trace_ksym.c - Kernel Symbol Tracer - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2009 - */ - -#include <linux/kallsyms.h> -#include <linux/uaccess.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/fs.h> - -#include "trace_output.h" -#include "trace.h" - -#include <linux/hw_breakpoint.h> -#include <asm/hw_breakpoint.h> - -#include <asm/atomic.h> - -#define KSYM_TRACER_OP_LEN 3 /* rw- */ - -struct trace_ksym { - struct perf_event **ksym_hbp; - struct perf_event_attr attr; -#ifdef CONFIG_PROFILE_KSYM_TRACER - atomic64_t counter; -#endif - struct hlist_node ksym_hlist; -}; - -static struct trace_array *ksym_trace_array; - -static unsigned int ksym_tracing_enabled; - -static HLIST_HEAD(ksym_filter_head); - -static DEFINE_MUTEX(ksym_tracer_mutex); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - -#define MAX_UL_INT 0xffffffff - -void ksym_collect_stats(unsigned long hbp_hit_addr) -{ - struct hlist_node *node; - struct trace_ksym *entry; - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == hbp_hit_addr) { - atomic64_inc(&entry->counter); - break; - } - } - rcu_read_unlock(); -} -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -void ksym_hbp_handler(struct perf_event *hbp, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct ring_buffer_event *event; - struct ksym_trace_entry *entry; - struct ring_buffer *buffer; - int pc; - - if (!ksym_tracing_enabled) - return; - - buffer = ksym_trace_array->buffer; - - pc = preempt_count(); - - event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, - sizeof(*entry), 0, pc); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(regs); - entry->type = hw_breakpoint_type(hbp); - entry->addr = hw_breakpoint_addr(hbp); - strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - ksym_collect_stats(hw_breakpoint_addr(hbp)); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -/* Valid access types are represented as - * - * rw- : Set Read/Write Access Breakpoint - * -w- : Set Write Access Breakpoint - * --- : Clear Breakpoints - * --x : Set Execution Break points (Not available yet) - * - */ -static int ksym_trace_get_access_type(char *str) -{ - int access = 0; - - if (str[0] == 'r') - access |= HW_BREAKPOINT_R; - - if (str[1] == 'w') - access |= HW_BREAKPOINT_W; - - if (str[2] == 'x') - access |= HW_BREAKPOINT_X; - - switch (access) { - case HW_BREAKPOINT_R: - case HW_BREAKPOINT_W: - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - return access; - default: - return -EINVAL; - } -} - -/* - * There can be several possible malformed requests and we attempt to capture - * all of them. We enumerate some of the rules - * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. - * i.e. multiple ':' symbols disallowed. Possible uses are of the form - * <module>:<ksym_name>:<op>. - * 2. No delimiter symbol ':' in the input string - * 3. Spurious operator symbols or symbols not in their respective positions - * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file - * 5. Kernel symbol not a part of /proc/kallsyms - * 6. Duplicate requests - */ -static int parse_ksym_trace_str(char *input_string, char **ksymname, - unsigned long *addr) -{ - int ret; - - *ksymname = strsep(&input_string, ":"); - *addr = kallsyms_lookup_name(*ksymname); - - /* Check for malformed request: (2), (1) and (5) */ - if ((!input_string) || - (strlen(input_string) != KSYM_TRACER_OP_LEN) || - (*addr == 0)) - return -EINVAL;; - - ret = ksym_trace_get_access_type(input_string); - - return ret; -} - -int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) -{ - struct trace_ksym *entry; - int ret = -ENOMEM; - - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - hw_breakpoint_init(&entry->attr); - - entry->attr.bp_type = op; - entry->attr.bp_addr = addr; - entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - - entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - - if (IS_ERR(entry->ksym_hbp)) { - ret = PTR_ERR(entry->ksym_hbp); - if (ret == -ENOSPC) { - printk(KERN_ERR "ksym_tracer: Maximum limit reached." - " No new requests for tracing can be accepted now.\n"); - } else { - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); - } - goto err; - } - - hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - - return 0; - -err: - kfree(entry); - - return ret; -} - -static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - struct trace_seq *s; - ssize_t cnt = 0; - int ret; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - trace_seq_init(s); - - mutex_lock(&ksym_tracer_mutex); - - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%pS:", - (void *)(unsigned long)entry->attr.bp_addr); - if (entry->attr.bp_type == HW_BREAKPOINT_R) - ret = trace_seq_puts(s, "r--\n"); - else if (entry->attr.bp_type == HW_BREAKPOINT_W) - ret = trace_seq_puts(s, "-w-\n"); - else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) - ret = trace_seq_puts(s, "rw-\n"); - WARN_ON_ONCE(!ret); - } - - cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); - - mutex_unlock(&ksym_tracer_mutex); - - kfree(s); - - return cnt; -} - -static void __ksym_trace_reset(void) -{ - struct trace_ksym *entry; - struct hlist_node *node, *node1; - - mutex_lock(&ksym_tracer_mutex); - hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, - ksym_hlist) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - } - mutex_unlock(&ksym_tracer_mutex); -} - -static ssize_t ksym_trace_filter_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - char *buf, *input_string, *ksymname = NULL; - unsigned long ksym_addr = 0; - int ret, op, changed = 0; - - buf = kzalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(buf, buffer, count)) - goto out; - - buf[count] = '\0'; - input_string = strstrip(buf); - - /* - * Clear all breakpoints if: - * 1: echo > ksym_trace_filter - * 2: echo 0 > ksym_trace_filter - * 3: echo "*:---" > ksym_trace_filter - */ - if (!input_string[0] || !strcmp(input_string, "0") || - !strcmp(input_string, "*:---")) { - __ksym_trace_reset(); - ret = 0; - goto out; - } - - ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); - if (ret < 0) - goto out; - - mutex_lock(&ksym_tracer_mutex); - - ret = -EINVAL; - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == ksym_addr) { - /* Check for malformed request: (6) */ - if (entry->attr.bp_type != op) - changed = 1; - else - goto out_unlock; - break; - } - } - if (changed) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - entry->attr.bp_type = op; - ret = 0; - if (op > 0) { - entry->ksym_hbp = - register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - if (IS_ERR(entry->ksym_hbp)) - ret = PTR_ERR(entry->ksym_hbp); - else - goto out_unlock; - } - /* Error or "symbol:---" case: drop it */ - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - goto out_unlock; - } else { - /* Check for malformed request: (4) */ - if (op) - ret = process_new_ksym_entry(ksymname, op, ksym_addr); - } -out_unlock: - mutex_unlock(&ksym_tracer_mutex); -out: - kfree(buf); - return !ret ? count : ret; -} - -static const struct file_operations ksym_tracing_fops = { - .open = tracing_open_generic, - .read = ksym_trace_filter_read, - .write = ksym_trace_filter_write, -}; - -static void ksym_trace_reset(struct trace_array *tr) -{ - ksym_tracing_enabled = 0; - __ksym_trace_reset(); -} - -static int ksym_trace_init(struct trace_array *tr) -{ - int cpu, ret = 0; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - ksym_tracing_enabled = 1; - ksym_trace_array = tr; - - return ret; -} - -static void ksym_trace_print_header(struct seq_file *m) -{ - seq_puts(m, - "# TASK-PID CPU# Symbol " - "Type Function\n"); - seq_puts(m, - "# | | | " - " | |\n"); -} - -static enum print_line_t ksym_trace_output(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct ksym_trace_entry *field; - char str[KSYM_SYMBOL_LEN]; - int ret; - - if (entry->type != TRACE_KSYM) - return TRACE_TYPE_UNHANDLED; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, - entry->pid, iter->cpu, (char *)field->addr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - switch (field->type) { - case HW_BREAKPOINT_R: - ret = trace_seq_printf(s, " R "); - break; - case HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " RW "); - break; - default: - return TRACE_TYPE_PARTIAL_LINE; - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - sprint_symbol(str, field->ip); - ret = trace_seq_printf(s, "%s\n", str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -struct tracer ksym_tracer __read_mostly = -{ - .name = "ksym_tracer", - .init = ksym_trace_init, - .reset = ksym_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_ksym, -#endif - .print_header = ksym_trace_print_header, - .print_line = ksym_trace_output -}; - -#ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_profile_show(struct seq_file *m, void *v) -{ - struct hlist_node *node; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; - - seq_puts(m, " Access Type "); - seq_puts(m, " Symbol Counter\n"); - seq_puts(m, " ----------- "); - seq_puts(m, " ------ -------\n"); - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - - access_type = entry->attr.bp_type; - - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); - } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", "<NA>"); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); - } - rcu_read_unlock(); - - return 0; -} - -static int ksym_profile_open(struct inode *node, struct file *file) -{ - return single_open(file, ksym_profile_show, NULL); -} - -static const struct file_operations ksym_profile_fops = { - .open = ksym_profile_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - trace_create_file("ksym_profile", 0444, d_tracer, - NULL, &ksym_profile_fops); -#endif - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b459647..02272baa220 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,9 +16,6 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = { .funcs = &trace_wake_funcs, }; -/* TRACE_SPECIAL */ -static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event_functions trace_special_funcs = { - .trace = trace_special_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, -}; - -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, - .funcs = &trace_special_funcs, -}; - /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, @@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_stack_funcs = { .trace = trace_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_stack_event = { @@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_user_stack_funcs = { .trace = trace_user_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_user_stack_event = { @@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, &trace_wake_event, - &trace_special_event, &trace_stack_event, &trace_user_stack_event, &trace_bprint_event, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2ef8c..4086eae6e81 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -46,7 +46,6 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int resched; int cpu; int pc; @@ -54,7 +53,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) @@ -74,7 +73,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); out_enable: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -383,6 +382,7 @@ static struct tracer wakeup_tracer __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -397,6 +397,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .use_max_tr = 1, }; __init static int init_wakeup_tracer(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 250e7f9bd2f..155a415b320 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_WAKE: case TRACE_STACK: case TRACE_PRINT: - case TRACE_SPECIAL: case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_KSYM: return 1; } return 0; @@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -#ifdef CONFIG_SYSPROF_TRACER -int -trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SYSPROF_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) @@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_KSYM_TRACER -static int ksym_selftest_dummy; - -int -trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - ksym_selftest_dummy = 0; - /* Register the read-write tracing request */ - - ret = process_new_ksym_entry("ksym_selftest_dummy", - HW_BREAKPOINT_R | HW_BREAKPOINT_W, - (unsigned long)(&ksym_selftest_dummy)); - - if (ret < 0) { - printk(KERN_CONT "ksym_trace read-write startup test failed\n"); - goto ret_path; - } - /* Perform a read and a write operation over the dummy variable to - * trigger the tracer - */ - if (ksym_selftest_dummy == 0) - ksym_selftest_dummy++; - - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - /* read & write operations - one each is performed on the dummy variable - * triggering two entries in the trace buffer - */ - if (!ret && count != 2) { - printk(KERN_CONT "Ksym tracer startup test failed"); - ret = -1; - } - -ret_path: - return ret; -} -#endif /* CONFIG_KSYM_TRACER */ - diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b27de5..056468eae7c 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -110,12 +110,12 @@ static inline void check_stack(void) static void stack_trace_call(unsigned long ip, unsigned long parent_ip) { - int cpu, resched; + int cpu; if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e35804304..bac752f0cfb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); +/* All syscall exit events have the same fields */ +static LIST_HEAD(syscall_exit_fields); + static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) static struct list_head * syscall_get_exit_fields(struct ftrace_event_call *call) { - struct syscall_metadata *entry = call->data; - - return &entry->exit_fields; + return &syscall_exit_fields; } struct trace_event_functions enter_syscall_print_funcs = { diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index a7974a552ca..00000000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * trace stack traces - * - * Copyright (C) 2004-2008, Soeren Sandmann - * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> - * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/hrtimer.h> -#include <linux/uaccess.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/irq.h> -#include <linux/fs.h> - -#include <asm/stacktrace.h> - -#include "trace.h" - -static struct trace_array *sysprof_trace; -static int __read_mostly tracer_enabled; - -/* - * 1 msec sample interval by default: - */ -static unsigned long sample_period = 1000000; -static const unsigned int sample_max_depth = 512; - -static DEFINE_MUTEX(sample_timer_lock); -/* - * Per CPU hrtimers that do the profiling: - */ -static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - int ret; - - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; - - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; -} - -struct backtrace_info { - struct trace_array_cpu *data; - struct trace_array *tr; - int pos; -}; - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - /* Don't bother with IRQ stacks for now */ - return -1; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct backtrace_info *info = data; - - if (info->pos < sample_max_depth && reliable) { - __trace_special(info->tr, info->data, 1, addr, 0); - - info->pos++; - } -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; - -static int -trace_kernel(struct pt_regs *regs, struct trace_array *tr, - struct trace_array_cpu *data) -{ - struct backtrace_info info; - unsigned long bp; - char *stack; - - info.tr = tr; - info.data = data; - info.pos = 1; - - __trace_special(info.tr, info.data, 1, regs->ip, 0); - - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = regs->bp; -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - - return info.pos; -} - -static void timer_notify(struct pt_regs *regs, int cpu) -{ - struct trace_array_cpu *data; - struct stack_frame frame; - struct trace_array *tr; - const void __user *fp; - int is_user; - int i; - - if (!regs) - return; - - tr = sysprof_trace; - data = tr->data[cpu]; - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - __trace_special(tr, data, 0, 0, current->pid); - - if (!is_user) - i = trace_kernel(regs, tr, data); - else - i = 0; - - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm && i < sample_max_depth) { - regs = (struct pt_regs *)current->thread.sp0 - 1; - - fp = (void __user *)regs->bp; - - __trace_special(tr, data, 2, regs->ip, 0); - - while (i < sample_max_depth) { - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - - i++; - } - - } - - /* - * Special trace entry if we overflow the max depth: - */ - if (i == sample_max_depth) - __trace_special(tr, data, -1, -1, -1); - - __trace_special(tr, data, 3, current->pid, i); -} - -static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -{ - /* trace here */ - timer_notify(get_irq_regs(), smp_processor_id()); - - hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - - return HRTIMER_RESTART; -} - -static void start_stack_timer(void *unused) -{ - struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); - - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = stack_trace_timer_fn; - - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); -} - -static void start_stack_timers(void) -{ - on_each_cpu(start_stack_timer, NULL, 1); -} - -static void stop_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_cancel(hrtimer); -} - -static void stop_stack_timers(void) -{ - int cpu; - - for_each_online_cpu(cpu) - stop_stack_timer(cpu); -} - -static void stop_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - tracer_enabled = 0; - mutex_unlock(&sample_timer_lock); -} - -static int stack_trace_init(struct trace_array *tr) -{ - sysprof_trace = tr; - - tracing_start_cmdline_record(); - - mutex_lock(&sample_timer_lock); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); - return 0; -} - -static void stack_trace_reset(struct trace_array *tr) -{ - tracing_stop_cmdline_record(); - stop_stack_trace(tr); -} - -static struct tracer stack_trace __read_mostly = -{ - .name = "sysprof", - .init = stack_trace_init, - .reset = stack_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sysprof, -#endif -}; - -__init static int init_stack_trace(void) -{ - return register_tracer(&stack_trace); -} -device_initcall(init_stack_trace); - -#define MAX_LONG_DIGITS 22 - -static ssize_t -sysprof_sample_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - int r; - - r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -sysprof_sample_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - unsigned long val; - - if (cnt > MAX_LONG_DIGITS-1) - cnt = MAX_LONG_DIGITS-1; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - val = simple_strtoul(buf, NULL, 10); - /* - * Enforce a minimum sample period of 100 usecs: - */ - if (val < 100) - val = 100; - - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - sample_period = val * 1000; - start_stack_timers(); - mutex_unlock(&sample_timer_lock); - - return cnt; -} - -static const struct file_operations sysprof_sample_fops = { - .read = sysprof_sample_read, - .write = sysprof_sample_write, -}; - -void init_tracer_sysprof_debugfs(struct dentry *d_tracer) -{ - - trace_create_file("sysprof_sample_period", 0644, - d_tracer, NULL, &sysprof_sample_fops); -} diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b2d70d38dff..25915832291 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/highuid.h> #include <linux/cred.h> /* @@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) schedule_work(&ns->destroyer); } EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 00000000000..613bc1f0461 --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,567 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/lockdep.h> +#include <linux/notifier.h> +#include <linux/module.h> +#include <linux/sysctl.h> + +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +int watchdog_enabled; +int __read_mostly softlockup_thresh = 60; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +static int __read_mostly did_panic; +static int __initdata no_watchdog; + + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int hardlockup_panic; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert softlockup_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return softlockup_thresh / 5 * NSEC_PER_SEC; +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = smp_processor_id(); + + __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +} + +void touch_softlockup_watchdog(void) +{ + __get_cpu_var(watchdog_touch_ts) = 0; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? + * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void touch_nmi_watchdog(void) +{ + __get_cpu_var(watchdog_nmi_touch) = true; + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +#endif + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* watchdog detector functions */ +static int is_hardlockup(void) +{ + unsigned long hrint = __get_cpu_var(hrtimer_interrupts); + + if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) + return 1; + + __get_cpu_var(hrtimer_interrupts_saved) = hrint; + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts) +{ + unsigned long now = get_timestamp(smp_processor_id()); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + softlockup_thresh)) + return now - touch_ts; + + return 0; +} + +static int +watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = watchdog_panic, +}; + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (__get_cpu_var(watchdog_nmi_touch) == true) { + __get_cpu_var(watchdog_nmi_touch) = false; + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__get_cpu_var(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __get_cpu_var(hard_watchdog_warn) = true; + return; + } + + __get_cpu_var(hard_watchdog_warn) = false; + return; +} +static void watchdog_interrupt_count(void) +{ + __get_cpu_var(hrtimer_interrupts)++; +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__get_cpu_var(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(__get_cpu_var(softlockup_touch_sync))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + __get_cpu_var(softlockup_touch_sync) = false; + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts); + if (unlikely(duration)) { + /* only warn once */ + if (__get_cpu_var(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + smp_processor_id(), duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __get_cpu_var(soft_watchdog_warn) = true; + } else + __get_cpu_var(soft_watchdog_warn) = false; + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *unused) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 60 seconds then the + * debug-printout triggers in watchdog_timer_fn(). + */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + return -1; + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* prepare/enable/disable routines */ +static int watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + return 0; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + + /* enable the perf event */ + if (watchdog_nmi_enable(cpu) != 0) + return -1; + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + return -1; + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + + return 0; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } + + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + int result = 0; + + for_each_online_cpu(cpu) + result += watchdog_enable(cpu); + + if (result) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ + +int proc_dowatchdog_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + return 0; +} + +int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (watchdog_prepare_cpu(hotcpu)) + return NOTIFY_BAD; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (watchdog_enable(hotcpu)) + return NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init spawn_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (no_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(err == NOTIFY_BAD); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; +} +early_initcall(spawn_watchdog_task); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2deb445..2994a0e3a61 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,41 +33,287 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> -#define CREATE_TRACE_POINTS -#include <trace/events/workqueue.h> +#include <linux/idr.h> + +#include "workqueue_sched.h" + +enum { + /* global_cwq flags */ + GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ + GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + + /* worker flags */ + WORKER_STARTED = 1 << 0, /* started */ + WORKER_DIE = 1 << 1, /* die die die */ + WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_PREP = 1 << 3, /* preparing to run works */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ + + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ + BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, + BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + + MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ + IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ + + MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ + CREATE_COOLDOWN = HZ, /* time to breath after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ + + /* + * Rescue workers are used only on emergencies and shared by + * all cpus. Give -20. + */ + RESCUER_NICE_LEVEL = -20, +}; /* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). + * Structure fields follow one of the following exclusion rules. + * + * I: Set during initialization and read-only afterwards. + * + * P: Preemption protected. Disabling preemption is enough and should + * only be modified and accessed from the local cpu. + * + * L: gcwq->lock protected. Access with gcwq->lock held. + * + * X: During normal operation, modification requires gcwq->lock and + * should be done only from local cpu. Either disabling preemption + * on local cpu or grabbing gcwq->lock is enough for read access. + * If GCWQ_DISASSOCIATED is set, it's identical to L. + * + * F: wq->flush_mutex protected. + * + * W: workqueue_lock protected. */ -struct cpu_workqueue_struct { - spinlock_t lock; +struct global_cwq; - struct list_head worklist; - wait_queue_head_t more_work; - struct work_struct *current_work; +/* + * The poor guys doing the actual heavy lifting. All on-duty workers + * are either serving the manager role, on idle list or on busy hash. + */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; - struct workqueue_struct *wq; - struct task_struct *thread; -} ____cacheline_aligned; + struct work_struct *current_work; /* L: work being processed */ + struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct global_cwq *gcwq; /* I: the associated gcwq */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ +}; + +/* + * Global per-cpu workqueue. There's one and only one for each cpu + * and all works are queued and processed here regardless of their + * target workqueues. + */ +struct global_cwq { + spinlock_t lock; /* the gcwq lock */ + struct list_head worklist; /* L: list of pending works */ + unsigned int cpu; /* I: the associated cpu */ + unsigned int flags; /* L: GCWQ_* flags */ + + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + /* workers are chained either in the idle_list or busy_hash */ + struct list_head idle_list; /* X: list of idle workers */ + struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + /* L: hash of busy workers */ + + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for dworkers */ + + struct ida worker_ida; /* L: for worker IDs */ + + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker *first_idle; /* L: first idle worker */ +} ____cacheline_aligned_in_smp; + +/* + * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. + */ +struct cpu_workqueue_struct { + struct global_cwq *gcwq; /* I: the associated gcwq */ + struct workqueue_struct *wq; /* I: the owning workqueue */ + int work_color; /* L: current color */ + int flush_color; /* L: flushing color */ + int nr_in_flight[WORK_NR_COLORS]; + /* L: nr of in_flight works */ + int nr_active; /* L: nr of active works */ + int max_active; /* L: max active works */ + struct list_head delayed_works; /* L: delayed works */ +}; + +/* + * Structure used to wait for workqueue flush. + */ +struct wq_flusher { + struct list_head list; /* F: list of flushers */ + int flush_color; /* F: flush color waiting for */ + struct completion done; /* flush completion */ +}; + +/* + * All cpumasks are assumed to be always set on UP and thus can't be + * used to determine whether there's something to be done. + */ +#ifdef CONFIG_SMP +typedef cpumask_var_t mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) \ + cpumask_test_and_set_cpu((cpu), (mask)) +#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) +#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) +#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp)) +#define free_mayday_mask(mask) free_cpumask_var((mask)) +#else +typedef unsigned long mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) +#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) +#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) +#define alloc_mayday_mask(maskp, gfp) true +#define free_mayday_mask(mask) do { } while (0) +#endif /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ - int rt; + unsigned int flags; /* I: WQ_* flags */ + union { + struct cpu_workqueue_struct __percpu *pcpu; + struct cpu_workqueue_struct *single; + unsigned long v; + } cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + + struct mutex flush_mutex; /* protects wq flushing */ + int work_color; /* F: current work color */ + int flush_color; /* F: current flush color */ + atomic_t nr_cwqs_to_flush; /* flush in progress */ + struct wq_flusher *first_flusher; /* F: first flusher */ + struct list_head flusher_queue; /* F: flush waiters */ + struct list_head flusher_overflow; /* F: flush overflow list */ + + mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct worker *rescuer; /* I: rescue worker */ + + int saved_max_active; /* W: saved cwq max_active */ + const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; +struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_nrt_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_long_wq); +EXPORT_SYMBOL_GPL(system_nrt_wq); +EXPORT_SYMBOL_GPL(system_unbound_wq); + +#define for_each_busy_worker(worker, i, pos, gcwq) \ + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + +static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) +{ + if (cpu < nr_cpu_ids) { + if (sw & 1) { + cpu = cpumask_next(cpu, mask); + if (cpu < nr_cpu_ids) + return cpu; + } + if (sw & 2) + return WORK_CPU_UNBOUND; + } + return WORK_CPU_NONE; +} + +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) +{ + return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); +} + +/* + * CPU iterators + * + * An extra gcwq is defined for an invalid cpu number + * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any + * specific CPU. The following iterators are similar to + * for_each_*_cpu() iterators but also considers the unbound gcwq. + * + * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * WORK_CPU_UNBOUND for unbound workqueues + */ +#define for_each_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + +#define for_each_online_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + +#define for_each_cwq_cpu(cpu, wq) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + +#ifdef CONFIG_LOCKDEP +/** + * in_workqueue_context() - in context of specified workqueue? + * @wq: the workqueue of interest + * + * Checks lockdep state to see if the current task is executing from + * within a workqueue item. This function exists only if lockdep is + * enabled. + */ +int in_workqueue_context(struct workqueue_struct *wq) +{ + return lock_is_held(&wq->lockdep_map); +} +#endif + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -107,7 +353,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); return 0; @@ -181,94 +427,575 @@ static inline void debug_work_deactivate(struct work_struct *work) { } /* Serializes the accesses to the list of workqueues. */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ + +/* + * The almighty global cpu workqueues. nr_running is the only field + * which is expected to be used frequently by other cpus via + * try_to_wake_up(). Put it in a separate cacheline. + */ +static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); -static int singlethread_cpu __read_mostly; -static const struct cpumask *cpu_singlethread_map __read_mostly; /* - * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD - * flushes cwq->worklist. This means that flush_workqueue/wait_on_work - * which comes in between can't use for_each_online_cpu(). We could - * use cpu_possible_map, the cpumask below is more a documentation - * than optimization. + * Global cpu workqueue and nr_running counter for unbound gcwq. The + * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its + * workers have WORKER_UNBOUND set. */ -static cpumask_var_t cpu_populated_map __read_mostly; +static struct global_cwq unbound_global_cwq; +static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ -/* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) +static int worker_thread(void *__worker); + +static struct global_cwq *get_gcwq(unsigned int cpu) { - return wq->singlethread; + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(global_cwq, cpu); + else + return &unbound_global_cwq; } -static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) +static atomic_t *get_gcwq_nr_running(unsigned int cpu) { - return is_wq_single_threaded(wq) - ? cpu_singlethread_map : cpu_populated_map; + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(gcwq_nr_running, cpu); + else + return &unbound_gcwq_nr_running; } -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) { - if (unlikely(is_wq_single_threaded(wq))) - cpu = singlethread_cpu; - return per_cpu_ptr(wq->cpu_wq, cpu); + if (!(wq->flags & WQ_UNBOUND)) { + if (likely(cpu < nr_cpu_ids)) { +#ifdef CONFIG_SMP + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +#else + return wq->cpu_wq.single; +#endif + } + } else if (likely(cpu == WORK_CPU_UNBOUND)) + return wq->cpu_wq.single; + return NULL; +} + +static unsigned int work_color_to_flags(int color) +{ + return color << WORK_STRUCT_COLOR_SHIFT; +} + +static int get_work_color(struct work_struct *work) +{ + return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + ((1 << WORK_STRUCT_COLOR_BITS) - 1); +} + +static int work_next_color(int color) +{ + return (color + 1) % WORK_NR_COLORS; } /* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set + * A work's data points to the cwq with WORK_STRUCT_CWQ set while the + * work is on queue. Once execution starts, WORK_STRUCT_CWQ is + * cleared and the work data contains the cpu number it was last on. + * + * set_work_{cwq|cpu}() and clear_work_data() can be used to set the + * cwq, cpu or clear work->data. These functions should only be + * called while the work is owned - ie. while the PENDING bit is set. + * + * get_work_[g]cwq() can be used to obtain the gcwq or cwq + * corresponding to a work. gcwq is available once the work has been + * queued anywhere after initialization. cwq is available only from + * queueing until execution starts. */ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) +static inline void set_work_data(struct work_struct *work, unsigned long data, + unsigned long flags) { - unsigned long new; - BUG_ON(!work_pending(work)); + atomic_long_set(&work->data, data | flags | work_static(work)); +} + +static void set_work_cwq(struct work_struct *work, + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) +{ + set_work_data(work, (unsigned long)cwq, + WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); +} + +static void set_work_cpu(struct work_struct *work, unsigned int cpu) +{ + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +} + +static void clear_work_data(struct work_struct *work) +{ + set_work_data(work, WORK_STRUCT_NO_CPU, 0); +} + +static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_CWQ) + return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + else + return NULL; +} + +static struct global_cwq *get_work_gcwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + unsigned int cpu; + + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + + cpu = data >> WORK_STRUCT_FLAG_BITS; + if (cpu == WORK_CPU_NONE) + return NULL; + + BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); + return get_gcwq(cpu); +} + +/* + * Policy functions. These define the policies on how the global + * worker pool is managed. Unless noted otherwise, these functions + * assume that they're being called with gcwq->lock held. + */ + +static bool __need_more_worker(struct global_cwq *gcwq) +{ + return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || + gcwq->flags & GCWQ_HIGHPRI_PENDING; +} + +/* + * Need to wake up a worker? Called from anything but currently + * running workers. + */ +static bool need_more_worker(struct global_cwq *gcwq) +{ + return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); +} + +/* Can I start working? Called from busy but !running workers. */ +static bool may_start_working(struct global_cwq *gcwq) +{ + return gcwq->nr_idle; +} + +/* Do I need to keep working? Called from currently running workers. */ +static bool keep_working(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; +} + +/* Do we need a new worker? Called from manager. */ +static bool need_to_create_worker(struct global_cwq *gcwq) +{ + return need_more_worker(gcwq) && !may_start_working(gcwq); +} + +/* Do I need to be the manager? */ +static bool need_to_manage_workers(struct global_cwq *gcwq) +{ + return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; +} + +/* Do we have too many workers and should some go away? */ +static bool too_many_workers(struct global_cwq *gcwq) +{ + bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->nr_workers - nr_idle; - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } /* - * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. + * Wake up functions. */ -static inline void clear_wq_data(struct work_struct *work) + +/* Return the first worker. Safe with preemption disabled */ +static struct worker *first_worker(struct global_cwq *gcwq) { - unsigned long flags = *work_data_bits(work) & - (1UL << WORK_STRUCT_STATIC); - atomic_long_set(&work->data, flags); + if (unlikely(list_empty(&gcwq->idle_list))) + return NULL; + + return list_first_entry(&gcwq->idle_list, struct worker, entry); } -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +/** + * wake_up_worker - wake up an idle worker + * @gcwq: gcwq to wake worker for + * + * Wake up the first idle worker of @gcwq. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void wake_up_worker(struct global_cwq *gcwq) { - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); + struct worker *worker = first_worker(gcwq); + + if (likely(worker)) + wake_up_process(worker->task); +} + +/** + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +{ + struct worker *worker = kthread_data(task); + + if (likely(!(worker->flags & WORKER_NOT_RUNNING))) + atomic_inc(get_gcwq_nr_running(cpu)); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * RETURNS: + * Worker task on @cpu to wake up, %NULL if none. + */ +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct global_cwq *gcwq = get_gcwq(cpu); + atomic_t *nr_running = get_gcwq_nr_running(cpu); + + if (unlikely(worker->flags & WORKER_NOT_RUNNING)) + return NULL; + + /* this can only happen on the local cpu */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* + * The counterpart of the following dec_and_test, implied mb, + * worklist not empty test sequence is in insert_work(). + * Please read comment there. + * + * NOT_RUNNING is clear. This means that trustee is not in + * charge and we're running on the local cpu w/ rq lock held + * and preemption disabled, which in turn means that none else + * could be manipulating idle_list, so dereferencing idle_list + * without gcwq lock is safe. + */ + if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + to_wakeup = first_worker(gcwq); + return to_wakeup ? to_wakeup->task : NULL; +} + +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * @wakeup: wakeup an idle worker if necessary + * + * Set @flags in @worker->flags and adjust nr_running accordingly. If + * nr_running becomes zero and @wakeup is %true, an idle worker is + * woken up. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags, + bool wakeup) +{ + struct global_cwq *gcwq = worker->gcwq; + + WARN_ON_ONCE(worker->task != current); + + /* + * If transitioning into NOT_RUNNING, adjust nr_running and + * wake up an idle worker as necessary if requested by + * @wakeup. + */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + if (wakeup) { + if (atomic_dec_and_test(nr_running) && + !list_empty(&gcwq->worklist)) + wake_up_worker(gcwq); + } else + atomic_dec(nr_running); + } + + worker->flags |= flags; } +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct global_cwq *gcwq = worker->gcwq; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* if transitioning out of NOT_RUNNING, increment nr_running */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(gcwq->cpu)); +} + +/** + * busy_worker_head - return the busy hash head for a work + * @gcwq: gcwq of interest + * @work: work to be hashed + * + * Return hash head of @gcwq for @work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to the hash head. + */ +static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, + struct work_struct *work) +{ + const int base_shift = ilog2(sizeof(struct work_struct)); + unsigned long v = (unsigned long)work; + + /* simple shift and fold hash, do we need something better? */ + v >>= base_shift; + v += v >> BUSY_WORKER_HASH_ORDER; + v &= BUSY_WORKER_HASH_MASK; + + return &gcwq->busy_hash[v]; +} + +/** + * __find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @bwh: hash head as returned by busy_worker_head() + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. @bwh should be + * the hash head obtained by calling busy_worker_head() with the same + * work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, + struct hlist_head *bwh, + struct work_struct *work) +{ + struct worker *worker; + struct hlist_node *tmp; + + hlist_for_each_entry(worker, tmp, bwh, hentry) + if (worker->current_work == work) + return worker; + return NULL; +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. This function is + * identical to __find_worker_executing_work() except that this + * function calculates @bwh itself. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct global_cwq *gcwq, + struct work_struct *work) +{ + return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), + work); +} + +/** + * gcwq_determine_ins_pos - find insertion position + * @gcwq: gcwq of interest + * @cwq: cwq a work is being queued for + * + * A work for @cwq is about to be queued on @gcwq, determine insertion + * position for the work. If @cwq is for HIGHPRI wq, the work is + * queued at the head of the queue but in FIFO order with respect to + * other HIGHPRI works; otherwise, at the end of the queue. This + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * there are HIGHPRI works pending. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to inserstion position. + */ +static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, + struct cpu_workqueue_struct *cwq) +{ + struct work_struct *twork; + + if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) + return &gcwq->worklist; + + list_for_each_entry(twork, &gcwq->worklist, entry) { + struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); + + if (!(tcwq->wq->flags & WQ_HIGHPRI)) + break; + } + + gcwq->flags |= GCWQ_HIGHPRI_PENDING; + return &twork->entry; +} + +/** + * insert_work - insert a work into gcwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work which belongs to @cwq into @gcwq after @head. + * @extra_flags is or'd to work_struct flags. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) { - trace_workqueue_insertion(cwq->thread, work); + struct global_cwq *gcwq = cwq->gcwq; + + /* we own @work, set data and link */ + set_work_cwq(work, cwq, extra_flags); - set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the * result of list_add() below, see try_to_grab_pending(). */ smp_wmb(); + list_add_tail(&work->entry, head); - wake_up(&cwq->more_work); + + /* + * Ensure either worker_sched_deactivated() sees the above + * list_add_tail() or we see zero nr_running to avoid workers + * lying around lazily while there are works to be processed. + */ + smp_mb(); + + if (__need_more_worker(gcwq)) + wake_up_worker(gcwq); } -static void __queue_work(struct cpu_workqueue_struct *cwq, +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { + struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; unsigned long flags; debug_work_activate(work); - spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); - spin_unlock_irqrestore(&cwq->lock, flags); + + /* determine gcwq to use */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *last_gcwq; + + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + + /* + * It's multi cpu. If @wq is non-reentrant and @work + * was previously on a different cpu, it might still + * be running there, in which case the work needs to + * be queued on that cpu to guarantee non-reentrance. + */ + gcwq = get_gcwq(cpu); + if (wq->flags & WQ_NON_REENTRANT && + (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + struct worker *worker; + + spin_lock_irqsave(&last_gcwq->lock, flags); + + worker = find_worker_executing_work(last_gcwq, work); + + if (worker && worker->current_cwq->wq == wq) + gcwq = last_gcwq; + else { + /* meh... not running there, queue here */ + spin_unlock_irqrestore(&last_gcwq->lock, flags); + spin_lock_irqsave(&gcwq->lock, flags); + } + } else + spin_lock_irqsave(&gcwq->lock, flags); + } else { + gcwq = get_gcwq(WORK_CPU_UNBOUND); + spin_lock_irqsave(&gcwq->lock, flags); + } + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + + BUG_ON(!list_empty(&work->entry)); + + cwq->nr_in_flight[cwq->work_color]++; + + if (likely(cwq->nr_active < cwq->max_active)) { + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else + worklist = &cwq->delayed_works; + + insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); + + spin_unlock_irqrestore(&gcwq->lock, flags); } /** @@ -308,9 +1035,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); ret = 1; } return ret; @@ -320,10 +1046,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } /** @@ -360,14 +1085,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + unsigned int lcpu; + BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + /* + * This stores cwq for the moment, for the timer_fn. + * Note that the work's gcwq is preserved to allow + * reentrance detection for delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else + lcpu = WORK_CPU_UNBOUND; + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -382,80 +1124,872 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -static void run_workqueue(struct cpu_workqueue_struct *cwq) +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_enter_idle(struct worker *worker) { - spin_lock_irq(&cwq->lock); - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, - struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(worker->flags & WORKER_IDLE); + BUG_ON(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev)); + + /* can't use worker_set_flags(), also called from start_worker() */ + worker->flags |= WORKER_IDLE; + gcwq->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &gcwq->idle_list); + + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); + + /* sanity check nr_running */ + WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + atomic_read(get_gcwq_nr_running(gcwq->cpu))); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(!(worker->flags & WORKER_IDLE)); + worker_clr_flags(worker, WORKER_IDLE); + gcwq->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * @worker: self + * + * Works which are scheduled while the cpu is online must at least be + * scheduled to a worker which is bound to the cpu so that if they are + * flushed from cpu callbacks while cpu is going down, they are + * guaranteed to execute on the cpu. + * + * This function is to be used by rogue workers and rescuers to bind + * themselves to the target cpu and may race with cpu going down or + * coming online. kthread_bind() can't be used because it may put the + * worker to already dead cpu and set_cpus_allowed_ptr() can't be used + * verbatim as it's best effort and blocking and gcwq may be + * [dis]associated in the meantime. + * + * This function tries set_cpus_allowed() and locks gcwq and verifies + * the binding against GCWQ_DISASSOCIATED which is set during + * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters + * idle state or fetches works without dropping lock, it can guarantee + * the scheduling requirement described in the first paragraph. + * + * CONTEXT: + * Might sleep. Called without any lock but returns with gcwq->lock + * held. + * + * RETURNS: + * %true if the associated gcwq is online (@worker is successfully + * bound), %false if offline. + */ +static bool worker_maybe_bind_and_lock(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; + + while (true) { /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. + * The following call may fail, succeed or succeed + * without actually migrating the task to the cpu if + * it races with cpu hotunplug operation. Verify + * against GCWQ_DISASSOCIATED. */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - trace_workqueue_execution(cwq->thread, work); - debug_work_deactivate(work); - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + + spin_lock_irq(&gcwq->lock); + if (gcwq->flags & GCWQ_DISASSOCIATED) + return false; + if (task_cpu(task) == gcwq->cpu && + cpumask_equal(¤t->cpus_allowed, + get_cpu_mask(gcwq->cpu))) + return true; + spin_unlock_irq(&gcwq->lock); + + /* CPU has come up inbetween, retry migration */ + cpu_relax(); + } +} + +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. + */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + +static struct worker *alloc_worker(void) +{ + struct worker *worker; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (worker) { + INIT_LIST_HEAD(&worker->entry); + INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); + /* on creation a worker is in !idle && prep state */ + worker->flags = WORKER_PREP; + } + return worker; +} + +/** + * create_worker - create a new workqueue worker + * @gcwq: gcwq the new worker will belong to + * @bind: whether to set affinity to @cpu or not + * + * Create a new worker which is bound to @gcwq. The returned worker + * can be started by calling start_worker() or destroyed using + * destroy_worker(). + * + * CONTEXT: + * Might sleep. Does GFP_KERNEL allocations. + * + * RETURNS: + * Pointer to the newly created worker. + */ +static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +{ + bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker *worker = NULL; + int id = -1; + + spin_lock_irq(&gcwq->lock); + while (ida_get_new(&gcwq->worker_ida, &id)) { + spin_unlock_irq(&gcwq->lock); + if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + goto fail; + spin_lock_irq(&gcwq->lock); + } + spin_unlock_irq(&gcwq->lock); + + worker = alloc_worker(); + if (!worker) + goto fail; + + worker->gcwq = gcwq; + worker->id = id; + + if (!on_unbound_cpu) + worker->task = kthread_create(worker_thread, worker, + "kworker/%u:%d", gcwq->cpu, id); + else + worker->task = kthread_create(worker_thread, worker, + "kworker/u:%d", id); + if (IS_ERR(worker->task)) + goto fail; + + /* + * A rogue worker will become a regular one if CPU comes + * online later on. Make sure every worker has + * PF_THREAD_BOUND set. + */ + if (bind && !on_unbound_cpu) + kthread_bind(worker->task, gcwq->cpu); + else { + worker->task->flags |= PF_THREAD_BOUND; + if (on_unbound_cpu) + worker->flags |= WORKER_UNBOUND; + } + + return worker; +fail: + if (id >= 0) { + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); + spin_unlock_irq(&gcwq->lock); + } + kfree(worker); + return NULL; +} + +/** + * start_worker - start a newly created worker + * @worker: worker to start + * + * Make the gcwq aware of @worker and start it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void start_worker(struct worker *worker) +{ + worker->flags |= WORKER_STARTED; + worker->gcwq->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); +} + +/** + * destroy_worker - destroy a workqueue worker + * @worker: worker to be destroyed + * + * Destroy @worker and adjust @gcwq stats accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void destroy_worker(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + int id = worker->id; + + /* sanity check frenzy */ + BUG_ON(worker->current_work); + BUG_ON(!list_empty(&worker->scheduled)); + + if (worker->flags & WORKER_STARTED) + gcwq->nr_workers--; + if (worker->flags & WORKER_IDLE) + gcwq->nr_idle--; + + list_del_init(&worker->entry); + worker->flags |= WORKER_DIE; + + spin_unlock_irq(&gcwq->lock); + + kthread_stop(worker->task); + kfree(worker); + + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); +} + +static void idle_worker_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + + spin_lock_irq(&gcwq->lock); + + if (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + /* idle_list is kept in LIFO order, check the last one */ + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) + mod_timer(&gcwq->idle_timer, expires); + else { + /* it's been idle for too long, wake up manager */ + gcwq->flags |= GCWQ_MANAGE_WORKERS; + wake_up_worker(gcwq); + } + } + + spin_unlock_irq(&gcwq->lock); +} + +static bool send_mayday(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct workqueue_struct *wq = cwq->wq; + unsigned int cpu; + + if (!(wq->flags & WQ_RESCUER)) + return false; + + /* mayday mayday mayday */ + cpu = cwq->gcwq->cpu; + /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ + if (cpu == WORK_CPU_UNBOUND) + cpu = 0; + if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + wake_up_process(wq->rescuer->task); + return true; +} + +static void gcwq_mayday_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + struct work_struct *work; + + spin_lock_irq(&gcwq->lock); + + if (need_to_create_worker(gcwq)) { + /* + * We've been trying to create a new worker but + * haven't been successful. We might be hitting an + * allocation deadlock. Send distress signals to + * rescuers. + */ + list_for_each_entry(work, &gcwq->worklist, entry) + send_mayday(work); + } + + spin_unlock_irq(&gcwq->lock); + + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); +} + +/** + * maybe_create_worker - create a new worker if necessary + * @gcwq: gcwq to create a new worker for + * + * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * have at least one idle worker on return from this function. If + * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is + * sent to all rescuers with works scheduled on @gcwq to resolve + * possible allocation deadlock. + * + * On return, need_to_create_worker() is guaranteed to be false and + * may_start_working() true. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. Called only from + * manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_create_worker(struct global_cwq *gcwq) +{ + if (!need_to_create_worker(gcwq)) + return false; +restart: + spin_unlock_irq(&gcwq->lock); + + /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + + while (true) { + struct worker *worker; + + worker = create_worker(gcwq, true); + if (worker) { + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + BUG_ON(need_to_create_worker(gcwq)); + return true; } - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; + if (!need_to_create_worker(gcwq)) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CREATE_COOLDOWN); + + if (!need_to_create_worker(gcwq)) + break; } - spin_unlock_irq(&cwq->lock); + + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) + goto restart; + return true; } -static int worker_thread(void *__cwq) +/** + * maybe_destroy_worker - destroy workers which have been idle for a while + * @gcwq: gcwq to destroy workers for + * + * Destroy @gcwq workers which have been idle for longer than + * IDLE_WORKER_TIMEOUT. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Called only from manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_destroy_workers(struct global_cwq *gcwq) { - struct cpu_workqueue_struct *cwq = __cwq; - DEFINE_WAIT(wait); + bool ret = false; - if (cwq->wq->freezeable) - set_freezable(); + while (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && - list_empty(&cwq->worklist)) - schedule(); - finish_wait(&cwq->more_work, &wait); + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; - try_to_freeze(); + if (time_before(jiffies, expires)) { + mod_timer(&gcwq->idle_timer, expires); + break; + } - if (kthread_should_stop()) + destroy_worker(worker); + ret = true; + } + + return ret; +} + +/** + * manage_workers - manage worker pool + * @worker: self + * + * Assume the manager role and manage gcwq worker pool @worker belongs + * to. At any given time, there can be only zero or one manager per + * gcwq. The exclusion is handled automatically by this function. + * + * The caller can safely start processing works on false return. On + * true return, it's guaranteed that need_to_create_worker() is false + * and may_start_working() is true. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true if + * some action was taken. + */ +static bool manage_workers(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + bool ret = false; + + if (gcwq->flags & GCWQ_MANAGING_WORKERS) + return ret; + + gcwq->flags &= ~GCWQ_MANAGE_WORKERS; + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + /* + * Destroy and then create so that may_start_working() is true + * on return. + */ + ret |= maybe_destroy_workers(gcwq); + ret |= maybe_create_worker(gcwq); + + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + + return ret; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + + move_linked_works(work, pos, NULL); + cwq->nr_active++; +} - run_workqueue(cwq); +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + cwq->nr_active--; + + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); } - return 0; + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * process_one_work - process single work + * @worker: self + * @work: work to process + * + * Process @work. This function contains all the logics necessary to + * process a single work including synchronization against and + * interaction with other workers on the same cpu, queueing and + * flushing. As long as context requirement is met, any worker can + * call this function to process a work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void process_one_work(struct worker *worker, struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct global_cwq *gcwq = cwq->gcwq; + struct hlist_head *bwh = busy_worker_head(gcwq, work); + bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; + work_func_t f = work->func; + int work_color; + struct worker *collision; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct from + * inside the function that is called from it, this we need to + * take into account for lockdep too. To avoid bogus "held + * lock freed" warnings as well as problems when looking into + * work->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + /* + * A single work shouldn't be executed concurrently by + * multiple workers on a single cpu. Check whether anyone is + * already processing the work. If so, defer the work to the + * currently executing one. + */ + collision = __find_worker_executing_work(gcwq, bwh, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, NULL); + return; + } + + /* claim and process */ + debug_work_deactivate(work); + hlist_add_head(&worker->hentry, bwh); + worker->current_work = work; + worker->current_cwq = cwq; + work_color = get_work_color(work); + + /* record the current cpu number in the work data and dequeue */ + set_work_cpu(work, gcwq->cpu); + list_del_init(&work->entry); + + /* + * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, + * wake up another worker; otherwise, clear HIGHPRI_PENDING. + */ + if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + struct work_struct *nwork = list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (!list_empty(&gcwq->worklist) && + get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) + wake_up_worker(gcwq); + else + gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + } + + /* + * CPU intensive works don't participate in concurrency + * management. They're the scheduler's responsibility. + */ + if (unlikely(cpu_intensive)) + worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + + spin_unlock_irq(&gcwq->lock); + + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + f(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), task_pid_nr(current)); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } + + spin_lock_irq(&gcwq->lock); + + /* clear cpu intensive status */ + if (unlikely(cpu_intensive)) + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + + /* we're done with it, release */ + hlist_del_init(&worker->hentry); + worker->current_work = NULL; + worker->current_cwq = NULL; + cwq_dec_nr_in_flight(cwq, work_color); +} + +/** + * process_scheduled_works - process scheduled works + * @worker: self + * + * Process all scheduled works. Please note that the scheduled list + * may change while processing a work, so this function repeatedly + * fetches a work from the top and executes it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. + */ +static void process_scheduled_works(struct worker *worker) +{ + while (!list_empty(&worker->scheduled)) { + struct work_struct *work = list_first_entry(&worker->scheduled, + struct work_struct, entry); + process_one_work(worker, work); + } +} + +/** + * worker_thread - the worker thread function + * @__worker: self + * + * The gcwq worker thread function. There's a single dynamic pool of + * these per each cpu. These workers process all works regardless of + * their specific target workqueue. The only exception is works which + * belong to workqueues with a rescuer which will be explained in + * rescuer_thread(). + */ +static int worker_thread(void *__worker) +{ + struct worker *worker = __worker; + struct global_cwq *gcwq = worker->gcwq; + + /* tell the scheduler that this is a workqueue worker */ + worker->task->flags |= PF_WQ_WORKER; +woke_up: + spin_lock_irq(&gcwq->lock); + + /* DIE can be set only while we're idle, checking here is enough */ + if (worker->flags & WORKER_DIE) { + spin_unlock_irq(&gcwq->lock); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + worker_leave_idle(worker); +recheck: + /* no more worker necessary? */ + if (!need_more_worker(gcwq)) + goto sleep; + + /* do we need to manage? */ + if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * ->scheduled list can only be filled while a worker is + * preparing to process a work or actually processing it. + * Make sure nobody diddled with it while I was sleeping. + */ + BUG_ON(!list_empty(&worker->scheduled)); + + /* + * When control reaches this point, we're guaranteed to have + * at least one idle worker or that someone else has already + * assumed the manager role. + */ + worker_clr_flags(worker, WORKER_PREP); + + do { + struct work_struct *work = + list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { + /* optimization path, not strictly necessary */ + process_one_work(worker, work); + if (unlikely(!list_empty(&worker->scheduled))) + process_scheduled_works(worker); + } else { + move_linked_works(work, &worker->scheduled, NULL); + process_scheduled_works(worker); + } + } while (keep_working(gcwq)); + + worker_set_flags(worker, WORKER_PREP, false); +sleep: + if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * gcwq->lock is held and there's no work to process and no + * need to manage, sleep. Workers are woken up only while + * holding gcwq->lock or from local cpu, so setting the + * current state before releasing gcwq->lock is enough to + * prevent losing any event. + */ + worker_enter_idle(worker); + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&gcwq->lock); + schedule(); + goto woke_up; +} + +/** + * rescuer_thread - the rescuer thread function + * @__wq: the associated workqueue + * + * Workqueue rescuer thread function. There's one rescuer for each + * workqueue which has WQ_RESCUER set. + * + * Regular work processing on a gcwq may block trying to create a new + * worker which uses GFP_KERNEL allocation which has slight chance of + * developing into deadlock if some works currently on the same queue + * need to be processed to satisfy the GFP_KERNEL allocation. This is + * the problem rescuer solves. + * + * When such condition is possible, the gcwq summons rescuers of all + * workqueues which have works queued on the gcwq and let them process + * those works so that forward progress can be guaranteed. + * + * This should happen rarely. + */ +static int rescuer_thread(void *__wq) +{ + struct workqueue_struct *wq = __wq; + struct worker *rescuer = wq->rescuer; + struct list_head *scheduled = &rescuer->scheduled; + bool is_unbound = wq->flags & WQ_UNBOUND; + unsigned int cpu; + + set_user_nice(current, RESCUER_NICE_LEVEL); +repeat: + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + /* + * See whether any cpu is asking for help. Unbounded + * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. + */ + for_each_mayday_cpu(cpu, wq->mayday_mask) { + unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; + struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + struct work_struct *work, *n; + + __set_current_state(TASK_RUNNING); + mayday_clear_cpu(cpu, wq->mayday_mask); + + /* migrate to the target cpu if possible */ + rescuer->gcwq = gcwq; + worker_maybe_bind_and_lock(rescuer); + + /* + * Slurp in all works issued via this workqueue and + * process'em. + */ + BUG_ON(!list_empty(&rescuer->scheduled)); + list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + if (get_work_cwq(work) == cwq) + move_linked_works(work, scheduled, &n); + + process_scheduled_works(rescuer); + spin_unlock_irq(&gcwq->lock); + } + + schedule(); + goto repeat; } struct wq_barrier { @@ -469,44 +2003,137 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } +/** + * insert_wq_barrier - insert a barrier work + * @cwq: cwq to insert barrier into + * @barr: wq_barrier to insert + * @target: target work to attach @barr to + * @worker: worker currently executing @target, NULL if @target is not executing + * + * @barr is linked to @target such that @barr is completed only after + * @target finishes execution. Please note that the ordering + * guarantee is observed only with respect to @target and on the local + * cpu. + * + * Currently, a queued barrier can't be canceled. This is because + * try_to_grab_pending() can't determine whether the work to be + * grabbed is at the head of the queue and thus can't clear LINKED + * flag of the previous work while there must be a valid next work + * after a work with LINKED flag set. + * + * Note that when @worker is non-NULL, @target may be modified + * underneath us, so we can't reliably determine cwq from @target. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) + struct wq_barrier *barr, + struct work_struct *target, struct worker *worker) { + struct list_head *head; + unsigned int linked = 0; + /* - * debugobject calls are safe here even with cwq->lock locked + * debugobject calls are safe here even with gcwq->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. */ INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - + __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + /* + * If @target is currently being executed, schedule the + * barrier to the worker; otherwise, put it after @target. + */ + if (worker) + head = worker->scheduled.next; + else { + unsigned long *bits = work_data_bits(target); + + head = target->entry.next; + /* there can already be other linked works, inherit and set */ + linked = *bits & WORK_STRUCT_LINKED; + __set_bit(WORK_STRUCT_LINKED_BIT, bits); + } + debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head); + insert_work(cwq, &barr->work, head, + work_color_to_flags(WORK_NO_COLOR) | linked); } -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +/** + * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * @wq: workqueue being flushed + * @flush_color: new flush color, < 0 for no-op + * @work_color: new work color, < 0 for no-op + * + * Prepare cwqs for workqueue flushing. + * + * If @flush_color is non-negative, flush_color on all cwqs should be + * -1. If no cwq has in-flight commands at the specified color, all + * cwq->flush_color's stay at -1 and %false is returned. If any cwq + * has in flight commands, its cwq->flush_color is set to + * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * wakeup logic is armed and %true is returned. + * + * The caller should have initialized @wq->first_flusher prior to + * calling this function with non-negative @flush_color. If + * @flush_color is negative, no flush color update is done and %false + * is returned. + * + * If @work_color is non-negative, all cwqs should have the same + * work_color which is previous to @work_color and all will be + * advanced to @work_color. + * + * CONTEXT: + * mutex_lock(wq->flush_mutex). + * + * RETURNS: + * %true if @flush_color >= 0 and there's something to flush. %false + * otherwise. + */ +static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, + int flush_color, int work_color) { - int active = 0; - struct wq_barrier barr; + bool wait = false; + unsigned int cpu; - WARN_ON(cwq->thread == current); - - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; + if (flush_color >= 0) { + BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); + atomic_set(&wq->nr_cwqs_to_flush, 1); } - spin_unlock_irq(&cwq->lock); - if (active) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + + spin_lock_irq(&gcwq->lock); + + if (flush_color >= 0) { + BUG_ON(cwq->flush_color != -1); + + if (cwq->nr_in_flight[flush_color]) { + cwq->flush_color = flush_color; + atomic_inc(&wq->nr_cwqs_to_flush); + wait = true; + } + } + + if (work_color >= 0) { + BUG_ON(work_color != work_next_color(cwq->work_color)); + cwq->work_color = work_color; + } + + spin_unlock_irq(&gcwq->lock); } - return active; + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + complete(&wq->first_flusher->done); + + return wait; } /** @@ -518,20 +2145,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * * We sleep until all works which were queued on entry have been handled, * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. */ void flush_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + struct wq_flusher this_flusher = { + .list = LIST_HEAD_INIT(this_flusher.list), + .flush_color = -1, + .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + }; + int next_color; - might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu(cpu, cpu_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); + + mutex_lock(&wq->flush_mutex); + + /* + * Start-to-wait phase + */ + next_color = work_next_color(wq->work_color); + + if (next_color != wq->flush_color) { + /* + * Color space is not full. The current work_color + * becomes our flush_color and work_color is advanced + * by one. + */ + BUG_ON(!list_empty(&wq->flusher_overflow)); + this_flusher.flush_color = wq->work_color; + wq->work_color = next_color; + + if (!wq->first_flusher) { + /* no flush in progress, become the first flusher */ + BUG_ON(wq->flush_color != this_flusher.flush_color); + + wq->first_flusher = &this_flusher; + + if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + wq->work_color)) { + /* nothing to flush, done */ + wq->flush_color = next_color; + wq->first_flusher = NULL; + goto out_unlock; + } + } else { + /* wait in queue */ + BUG_ON(wq->flush_color == this_flusher.flush_color); + list_add_tail(&this_flusher.list, &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + } else { + /* + * Oops, color space is full, wait on overflow queue. + * The next flush completion will assign us + * flush_color and transfer to flusher_queue. + */ + list_add_tail(&this_flusher.list, &wq->flusher_overflow); + } + + mutex_unlock(&wq->flush_mutex); + + wait_for_completion(&this_flusher.done); + + /* + * Wake-up-and-cascade phase + * + * First flushers are responsible for cascading flushes and + * handling overflow. Non-first flushers can simply return. + */ + if (wq->first_flusher != &this_flusher) + return; + + mutex_lock(&wq->flush_mutex); + + /* we might have raced, check again with mutex held */ + if (wq->first_flusher != &this_flusher) + goto out_unlock; + + wq->first_flusher = NULL; + + BUG_ON(!list_empty(&this_flusher.list)); + BUG_ON(wq->flush_color != this_flusher.flush_color); + + while (true) { + struct wq_flusher *next, *tmp; + + /* complete all the flushers sharing the current flush color */ + list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { + if (next->flush_color != wq->flush_color) + break; + list_del_init(&next->list); + complete(&next->done); + } + + BUG_ON(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); + + /* this flush_color is finished, advance by one */ + wq->flush_color = work_next_color(wq->flush_color); + + /* one color has been freed, handle overflow queue */ + if (!list_empty(&wq->flusher_overflow)) { + /* + * Assign the same color to all overflowed + * flushers, advance work_color and append to + * flusher_queue. This is the start-to-wait + * phase for these overflowed flushers. + */ + list_for_each_entry(tmp, &wq->flusher_overflow, list) + tmp->flush_color = wq->work_color; + + wq->work_color = work_next_color(wq->work_color); + + list_splice_tail_init(&wq->flusher_overflow, + &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + + if (list_empty(&wq->flusher_queue)) { + BUG_ON(wq->flush_color != wq->work_color); + break; + } + + /* + * Need to flush more colors. Make the next flusher + * the new first flusher and arm cwqs. + */ + BUG_ON(wq->flush_color == wq->work_color); + BUG_ON(wq->flush_color != next->flush_color); + + list_del_init(&next->list); + wq->first_flusher = next; + + if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + break; + + /* + * Meh... this color is already done, clear first + * flusher and repeat cascading. + */ + wq->first_flusher = NULL; + } + +out_unlock: + mutex_unlock(&wq->flush_mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,43 +2304,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue); */ int flush_work(struct work_struct *work) { + struct worker *worker = NULL; + struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; - struct list_head *prev; struct wq_barrier barr; might_sleep(); - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return 0; - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - prev = NULL; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. + * If it was re-queued to a different gcwq under us, we + * are not going to wait. */ smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) - goto out; - prev = &work->entry; + cwq = get_work_cwq(work); + if (unlikely(!cwq || gcwq != cwq->gcwq)) + goto already_gone; } else { - if (cwq->current_work != work) - goto out; - prev = &cwq->worklist; + worker = find_worker_executing_work(gcwq, work); + if (!worker) + goto already_gone; + cwq = worker->current_cwq; } - insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; + + insert_wq_barrier(cwq, &barr, work, worker); + spin_unlock_irq(&gcwq->lock); + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return 1; +already_gone: + spin_unlock_irq(&gcwq->lock); + return 0; } EXPORT_SYMBOL_GPL(flush_work); @@ -593,54 +2353,55 @@ EXPORT_SYMBOL_GPL(flush_work); */ static int try_to_grab_pending(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; + struct global_cwq *gcwq; int ret = -1; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return ret; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* - * This work is queued, but perhaps we locked the wrong cwq. + * This work is queued, but perhaps we locked the wrong gcwq. * In that case we must see the new value after rmb(), see * insert_work()->wmb(). */ smp_rmb(); - if (cwq == get_wq_data(work)) { + if (gcwq == get_work_gcwq(work)) { debug_work_deactivate(work); list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work)); ret = 1; } } - spin_unlock_irq(&cwq->lock); + spin_unlock_irq(&gcwq->lock); return ret; } -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) +static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) { struct wq_barrier barr; - int running = 0; + struct worker *worker; - spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; - } - spin_unlock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); + + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); - if (unlikely(running)) { + spin_unlock_irq(&gcwq->lock); + + if (unlikely(worker)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); } @@ -648,9 +2409,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, static void wait_on_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -658,15 +2416,8 @@ static void wait_on_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - cpu_map = wq_cpu_map(wq); - - for_each_cpu(cpu, cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + for_each_gcwq_cpu(cpu) + wait_on_cpu_work(get_gcwq(cpu), work); } static int __cancel_work_timer(struct work_struct *work, @@ -681,7 +2432,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - clear_wq_data(work); + clear_work_data(work); return ret; } @@ -727,8 +2478,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -static struct workqueue_struct *keventd_wq __read_mostly; - /** * schedule_work - put work task in global workqueue * @work: job to be done @@ -742,7 +2491,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; */ int schedule_work(struct work_struct *work) { - return queue_work(keventd_wq, work); + return queue_work(system_wq, work); } EXPORT_SYMBOL(schedule_work); @@ -755,7 +2504,7 @@ EXPORT_SYMBOL(schedule_work); */ int schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, keventd_wq, work); + return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); @@ -770,7 +2519,7 @@ EXPORT_SYMBOL(schedule_work_on); int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, dwork, delay); + return queue_delayed_work(system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -783,9 +2532,8 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); - __queue_work(cwq, &dwork->work); + __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, + &dwork->work); put_cpu(); } flush_work(&dwork->work); @@ -804,7 +2552,7 @@ EXPORT_SYMBOL(flush_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); @@ -820,8 +2568,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; - int orig = -1; - struct work_struct *works; + struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) @@ -829,23 +2576,12 @@ int schedule_on_each_cpu(work_func_t func) get_online_cpus(); - /* - * When running in keventd don't schedule a work item on - * itself. Can just call directly because the work queue is - * already bound. This also is faster. - */ - if (current_is_keventd()) - orig = raw_smp_processor_id(); - for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - if (cpu != orig) - schedule_work_on(cpu, work); + schedule_work_on(cpu, work); } - if (orig >= 0) - func(per_cpu_ptr(works, orig)); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); @@ -881,7 +2617,7 @@ int schedule_on_each_cpu(work_func_t func) */ void flush_scheduled_work(void) { - flush_workqueue(keventd_wq); + flush_workqueue(system_wq); } EXPORT_SYMBOL(flush_scheduled_work); @@ -913,170 +2649,170 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); int keventd_up(void) { - return keventd_wq != NULL; + return system_wq != NULL; } -int current_is_keventd(void) +static int alloc_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq; - int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ - int ret = 0; - - BUG_ON(!keventd_wq); + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long. + */ + const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, + __alignof__(unsigned long long)); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); - if (current == cwq->thread) - ret = 1; + if (percpu) + wq->cpu_wq.pcpu = __alloc_percpu(size, align); + else { + void *ptr; - return ret; + /* + * Allocate enough room to align cwq and put an extra + * pointer at the end pointing back to the originally + * allocated pointer which will be used for free. + */ + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } + } + /* just in case, make sure it's actually aligned */ + BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); + return wq->cpu_wq.v ? 0 : -ENOMEM; } -static struct cpu_workqueue_struct * -init_cpu_workqueue(struct workqueue_struct *wq, int cpu) +static void free_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - return cwq; + if (percpu) + free_percpu(wq->cpu_wq.pcpu); + else if (wq->cpu_wq.single) { + /* the pointer to free is stored right after the cwq */ + kfree(*(void **)(wq->cpu_wq.single + 1)); + } } -static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +static int wq_clamp_max_active(int max_active, unsigned int flags, + const char *name) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; - struct task_struct *p; + int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); - /* - * Nobody can add the work_struct to this cwq, - * if (caller is __create_workqueue) - * nobody should see this wq - * else // caller is CPU_UP_PREPARE - * cpu is not on cpu_online_map - * so we can abort safely. - */ - if (IS_ERR(p)) - return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - cwq->thread = p; + if (max_active < 1 || max_active > lim) + printk(KERN_WARNING "workqueue: max_active %d requested for %s " + "is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); - trace_workqueue_creation(cwq->thread, cpu); - - return 0; + return clamp_val(max_active, 1, lim); } -static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +struct workqueue_struct *__alloc_workqueue_key(const char *name, + unsigned int flags, + int max_active, + struct lock_class_key *key, + const char *lock_name) { - struct task_struct *p = cwq->thread; + struct workqueue_struct *wq; + unsigned int cpu; - if (p != NULL) { - if (cpu >= 0) - kthread_bind(p, cpu); - wake_up_process(p); - } -} + /* + * Unbound workqueues aren't concurrency managed and should be + * dispatched to workers immediately. + */ + if (flags & WQ_UNBOUND) + flags |= WQ_HIGHPRI; -struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - int rt, - struct lock_class_key *key, - const char *lock_name) -{ - struct workqueue_struct *wq; - struct cpu_workqueue_struct *cwq; - int err = 0, cpu; + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) - return NULL; + goto err; - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + wq->flags = flags; + wq->saved_max_active = max_active; + mutex_init(&wq->flush_mutex); + atomic_set(&wq->nr_cwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->flusher_queue); + INIT_LIST_HEAD(&wq->flusher_overflow); wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - wq->rt = rt; INIT_LIST_HEAD(&wq->list); - if (singlethread) { - cwq = init_cpu_workqueue(wq, singlethread_cpu); - err = create_workqueue_thread(cwq, singlethread_cpu); - start_workqueue_thread(cwq, -1); - } else { - cpu_maps_update_begin(); - /* - * We must place this wq on list even if the code below fails. - * cpu_down(cpu) can remove cpu from cpu_populated_map before - * destroy_workqueue() takes the lock, in that case we leak - * cwq[cpu]->thread. - */ - spin_lock(&workqueue_lock); - list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. - */ - for_each_possible_cpu(cpu) { - cwq = init_cpu_workqueue(wq, cpu); - if (err || !cpu_online(cpu)) - continue; - err = create_workqueue_thread(cwq, cpu); - start_workqueue_thread(cwq, cpu); - } - cpu_maps_update_done(); + if (alloc_cwqs(wq) < 0) + goto err; + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = get_gcwq(cpu); + + BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); + cwq->gcwq = gcwq; + cwq->wq = wq; + cwq->flush_color = -1; + cwq->max_active = max_active; + INIT_LIST_HEAD(&cwq->delayed_works); } - if (err) { - destroy_workqueue(wq); - wq = NULL; + if (flags & WQ_RESCUER) { + struct worker *rescuer; + + if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) + goto err; + + wq->rescuer = rescuer = alloc_worker(); + if (!rescuer) + goto err; + + rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + if (IS_ERR(rescuer->task)) + goto err; + + wq->rescuer = rescuer; + rescuer->task->flags |= PF_THREAD_BOUND; + wake_up_process(rescuer->task); } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. + * workqueue_lock protects global freeze state and workqueues + * list. Grab it, set max_active accordingly and add the new + * workqueue to workqueues list. */ - if (cwq->thread == NULL) - return; + spin_lock(&workqueue_lock); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + for_each_cwq_cpu(cpu, wq) + get_cwq(cpu, wq)->max_active = 0; - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. - * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - trace_workqueue_destruction(cwq->thread); - kthread_stop(cwq->thread); - cwq->thread = NULL; + list_add(&wq->list, &workqueues); + + spin_unlock(&workqueue_lock); + + return wq; +err: + if (wq) { + free_cwqs(wq); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + kfree(wq); + } + return NULL; } +EXPORT_SYMBOL_GPL(__alloc_workqueue_key); /** * destroy_workqueue - safely terminate a workqueue @@ -1086,72 +2822,516 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + unsigned int cpu; - cpu_maps_update_begin(); + flush_workqueue(wq); + + /* + * wq list is used to freeze wq, remove from list after + * flushing is complete in case freeze races us. + */ spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu(cpu, cpu_map) - cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - cpu_maps_update_done(); + /* sanity check */ + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + BUG_ON(cwq->nr_in_flight[i]); + BUG_ON(cwq->nr_active); + BUG_ON(!list_empty(&cwq->delayed_works)); + } + + if (wq->flags & WQ_RESCUER) { + kthread_stop(wq->rescuer->task); + free_mayday_mask(wq->mayday_mask); + } - free_percpu(wq->cpu_wq); + free_cwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * workqueue_set_max_active - adjust max_active of a workqueue + * @wq: target workqueue + * @max_active: new max_active value. + * + * Set max_active of @wq to @max_active. + * + * CONTEXT: + * Don't call from IRQ context. + */ +void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) +{ + unsigned int cpu; + + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); + + spin_lock(&workqueue_lock); + + wq->saved_max_active = max_active; + + for_each_cwq_cpu(cpu, wq) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_irq(&gcwq->lock); + + if (!(wq->flags & WQ_FREEZEABLE) || + !(gcwq->flags & GCWQ_FREEZING)) + get_cwq(gcwq->cpu, wq)->max_active = max_active; + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); + +/** + * workqueue_congested - test whether a workqueue is congested + * @cpu: CPU in question + * @wq: target workqueue + * + * Test whether @wq's cpu workqueue for @cpu is congested. There is + * no synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * + * RETURNS: + * %true if congested, %false otherwise. + */ +bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + return !list_empty(&cwq->delayed_works); +} +EXPORT_SYMBOL_GPL(workqueue_congested); + +/** + * work_cpu - return the last known associated cpu for @work + * @work: the work of interest + * + * RETURNS: + * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. + */ +unsigned int work_cpu(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + + return gcwq ? gcwq->cpu : WORK_CPU_NONE; +} +EXPORT_SYMBOL_GPL(work_cpu); + +/** + * work_busy - test whether a work is currently pending or running + * @work: the work to be tested + * + * Test whether @work is currently pending or running. There is no + * synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * Especially for reentrant wqs, the pending state might hide the + * running state. + * + * RETURNS: + * OR'd bitmask of WORK_BUSY_* bits. + */ +unsigned int work_busy(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long flags; + unsigned int ret = 0; + + if (!gcwq) + return false; + + spin_lock_irqsave(&gcwq->lock, flags); + + if (work_pending(work)) + ret |= WORK_BUSY_PENDING; + if (find_worker_executing_work(gcwq, work)) + ret |= WORK_BUSY_RUNNING; + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(work_busy); + +/* + * CPU hotplug. + * + * There are two challenges in supporting CPU hotplug. Firstly, there + * are a lot of assumptions on strong associations among work, cwq and + * gcwq which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, + * gcwqs serve mix of short, long and very long running works making + * blocked draining impractical. + * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has went down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- + */ + +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled. + */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) + +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for CANCEL request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? -1 : 0; \ +}) + +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct work_struct *work; + struct hlist_node *pos; + long rc; + int i; + + BUG_ON(gcwq->cpu != smp_processor_id()); + + spin_lock_irq(&gcwq->lock); + /* + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. + */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); + + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; + + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); + + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); + spin_lock_irq(&gcwq->lock); + + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); + + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezeable cwqs. Don't declare + * completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + int nr_works = 0; + + list_for_each_entry(work, &gcwq->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &gcwq->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(gcwq)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } + } + + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; + } + + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); + + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); + spin_unlock_irq(&gcwq->lock); + return 0; +} + +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. + */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - int err = 0; + struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; action &= ~CPU_TASKS_FROZEN; switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ case CPU_UP_PREPARE: - cpumask_set_cpu(cpu, cpu_populated_map); - } -undo: - list_for_each_entry(wq, &workqueues, list) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - switch (action) { - case CPU_UP_PREPARE: - err = create_workqueue_thread(cwq, cpu); - if (!err) - break; - printk(KERN_ERR "workqueue [%s] for %i failed\n", - wq->name, cpu); - action = CPU_UP_CANCELED; - err = -ENOMEM; - goto undo; - - case CPU_ONLINE: - start_workqueue_thread(cwq, cpu); - break; - - case CPU_UP_CANCELED: - start_workqueue_thread(cwq, -1); - case CPU_POST_DEAD: - cleanup_workqueue_thread(cwq); - break; + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; } } + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + switch (action) { - case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_DYING: + /* + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + break; + case CPU_POST_DEAD: - cpumask_clear_cpu(cpu, cpu_populated_map); + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; } - return notifier_from_errno(err); + spin_unlock_irqrestore(&gcwq->lock, flags); + + return notifier_from_errno(0); } #ifdef CONFIG_SMP @@ -1201,14 +3381,199 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) EXPORT_SYMBOL_GPL(work_on_cpu); #endif /* CONFIG_SMP */ -void __init init_workqueues(void) +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all + * freezeable workqueues will queue new works to their frozen_works + * list instead of gcwq->worklist. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void freeze_workqueues_begin(void) { - alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + unsigned int cpu; - cpumask_copy(cpu_populated_map, cpu_online_mask); - singlethread_cpu = cpumask_first(cpu_possible_mask); - cpu_singlethread_map = cpumask_of(singlethread_cpu); - hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); - BUG_ON(!keventd_wq); + spin_lock(&workqueue_lock); + + BUG_ON(workqueue_freezing); + workqueue_freezing = true; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(gcwq->flags & GCWQ_FREEZING); + gcwq->flags |= GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (cwq && wq->flags & WQ_FREEZEABLE) + cwq->max_active = 0; + } + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} + +/** + * freeze_workqueues_busy - are freezeable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases workqueue_lock. + * + * RETURNS: + * %true if some freezeable workqueues are still busy. %false if + * freezing is complete. + */ +bool freeze_workqueues_busy(void) +{ + unsigned int cpu; + bool busy = false; + + spin_lock(&workqueue_lock); + + BUG_ON(!workqueue_freezing); + + for_each_gcwq_cpu(cpu) { + struct workqueue_struct *wq; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. + */ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + BUG_ON(cwq->nr_active < 0); + if (cwq->nr_active) { + busy = true; + goto out_unlock; + } + } + } +out_unlock: + spin_unlock(&workqueue_lock); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective gcwq worklists. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void thaw_workqueues(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + if (!workqueue_freezing) + goto out_unlock; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); + gcwq->flags &= ~GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + /* restore max_active and repopulate worklist */ + cwq->max_active = wq->saved_max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + wake_up_worker(gcwq); + + spin_unlock_irq(&gcwq->lock); + } + + workqueue_freezing = false; +out_unlock: + spin_unlock(&workqueue_lock); +} +#endif /* CONFIG_FREEZER */ + +static int __init init_workqueues(void) +{ + unsigned int cpu; + int i; + + cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + + /* initialize gcwqs */ + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_init(&gcwq->lock); + INIT_LIST_HEAD(&gcwq->worklist); + gcwq->cpu = cpu; + if (cpu == WORK_CPU_UNBOUND) + gcwq->flags |= GCWQ_DISASSOCIATED; + + INIT_LIST_HEAD(&gcwq->idle_list); + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) + INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + + init_timer_deferrable(&gcwq->idle_timer); + gcwq->idle_timer.function = idle_worker_timeout; + gcwq->idle_timer.data = (unsigned long)gcwq; + + setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + (unsigned long)gcwq); + + ida_init(&gcwq->worker_ida); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); + } + + /* create the initial worker */ + for_each_online_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + + worker = create_worker(gcwq, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } + + system_wq = alloc_workqueue("events", 0, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); + return 0; } +early_initcall(init_workqueues); diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 00000000000..2d10fc98dc7 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,9 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); |