diff options
Diffstat (limited to 'kernel')
76 files changed, 2973 insertions, 2313 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 17ea6d4a9a2..a59481a3fa6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o diff --git a/kernel/audit.c b/kernel/audit.c index 1f37f15117e..f8f203e8018 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -833,7 +833,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_skb_queue); - s.version = AUDIT_VERSION_LATEST; + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 80f29e01557..2e0c97427b3 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -174,9 +174,9 @@ static void insert_hash(struct audit_chunk *chunk) struct fsnotify_mark *entry = &chunk->mark; struct list_head *list; - if (!entry->i.inode) + if (!entry->inode) return; - list = chunk_hash(entry->i.inode); + list = chunk_hash(entry->inode); list_add_rcu(&chunk->hash, list); } @@ -188,7 +188,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.i.inode == inode) { + if (p->mark.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -231,7 +231,7 @@ static void untag_chunk(struct node *p) new = alloc_chunk(size); spin_lock(&entry->lock); - if (chunk->dead || !entry->i.inode) { + if (chunk->dead || !entry->inode) { spin_unlock(&entry->lock); if (new) free_chunk(new); @@ -258,7 +258,7 @@ static void untag_chunk(struct node *p) goto Fallback; fsnotify_duplicate_mark(&new->mark, entry); - if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -386,7 +386,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk_entry = &chunk->mark; spin_lock(&old_entry->lock); - if (!old_entry->i.inode) { + if (!old_entry->inode) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); fsnotify_put_mark(old_entry); @@ -395,7 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) } fsnotify_duplicate_mark(chunk_entry, old_entry); - if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->inode, NULL, 1)) { spin_unlock(&old_entry->lock); fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); @@ -611,7 +611,7 @@ void audit_trim_trees(void) list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ - struct inode *inode = chunk->mark.i.inode; + struct inode *inode = chunk->mark.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e420a0c41b5..c75522a8367 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1897,6 +1897,11 @@ out: audit_copy_inode(n, dentry, inode); } +void __audit_file(const struct file *file) +{ + __audit_inode(NULL, file->f_path.dentry, 0); +} + /** * __audit_inode_child - collect inode info for created/removed objects * @parent: inode of dentry parent @@ -2373,7 +2378,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->d.next = context->aux; context->aux = (void *)ax; - dentry = dget(bprm->file->f_dentry); + dentry = dget(bprm->file->f_path.dentry); get_vfs_caps_from_disk(dentry, &vcaps); dput(dentry); diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0daf7f6ae7d..a5ae60f0b0a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o ifdef CONFIG_TEST_BPF obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c new file mode 100644 index 00000000000..9eb4d8a7cd8 --- /dev/null +++ b/kernel/bpf/arraymap.c @@ -0,0 +1,156 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/mm.h> + +struct bpf_array { + struct bpf_map map; + u32 elem_size; + char value[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array *array; + u32 elem_size, array_size; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + elem_size = round_up(attr->value_size, 8); + + /* check round_up into zero and u32 overflow */ + if (elem_size == 0 || + attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + return ERR_PTR(-ENOMEM); + + array_size = sizeof(*array) + attr->max_entries * elem_size; + + /* allocate all map elements and zero-initialize them */ + array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); + if (!array) { + array = vzalloc(array_size); + if (!array) + return ERR_PTR(-ENOMEM); + } + + /* copy mandatory map attributes */ + array->map.key_size = attr->key_size; + array->map.value_size = attr->value_size; + array->map.max_entries = attr->max_entries; + + array->elem_size = elem_size; + + return &array->map; +} + +/* Called from syscall or from eBPF program */ +static void *array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return NULL; + + return array->value + array->elem_size * index; +} + +/* Called from syscall */ +static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + if (index >= array->map.max_entries) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (map_flags == BPF_NOEXIST) + /* all elements already exist */ + return -EEXIST; + + memcpy(array->value + array->elem_size * index, value, array->elem_size); + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding programs to complete + * and free the array + */ + synchronize_rcu(); + + kvfree(array); +} + +static struct bpf_map_ops array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &array_ops, + .type = BPF_MAP_TYPE_ARRAY, +}; + +static int __init register_array_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_array_map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c new file mode 100644 index 00000000000..b3ba4367431 --- /dev/null +++ b/kernel/bpf/hashtab.c @@ -0,0 +1,367 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> + +struct bpf_htab { + struct bpf_map map; + struct hlist_head *buckets; + spinlock_t lock; + u32 count; /* number of elements in this hashtable */ + u32 n_buckets; /* number of hash buckets */ + u32 elem_size; /* size of each element in bytes */ +}; + +/* each htab element is struct htab_elem + key + value */ +struct htab_elem { + struct hlist_node hash_node; + struct rcu_head rcu; + u32 hash; + char key[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int err, i; + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + htab->map.key_size = attr->key_size; + htab->map.value_size = attr->value_size; + htab->map.max_entries = attr->max_entries; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + err = -EINVAL; + if (htab->map.max_entries == 0 || htab->map.key_size == 0 || + htab->map.value_size == 0) + goto free_htab; + + /* hash table size must be power of 2 */ + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + + err = -E2BIG; + if (htab->map.key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + goto free_htab; + + err = -ENOMEM; + /* prevent zero size kmalloc and check for u32 overflow */ + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + goto free_htab; + + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + GFP_USER | __GFP_NOWARN); + + if (!htab->buckets) { + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + if (!htab->buckets) + goto free_htab; + } + + for (i = 0; i < htab->n_buckets; i++) + INIT_HLIST_HEAD(&htab->buckets[i]); + + spin_lock_init(&htab->lock); + htab->count = 0; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + return &htab->map; + +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, + void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + return NULL; +} + +/* Called from syscall or from eBPF program */ +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 hash, key_size; + + /* Must be called with rcu_read_lock. */ + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) + return l->key + round_up(map->key_size, 8); + + return NULL; +} + +/* Called from syscall */ +static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l, *next_l; + u32 hash, key_size; + int i; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + /* lookup the key */ + l = lookup_elem_raw(head, hash, key, key_size); + + if (!l) { + i = 0; + goto find_first_elem; + } + + /* key was found, get next key in the same bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + + if (next_l) { + /* if next elem in this hash list is non-zero, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* itereated over all buckets and all elements */ + return -ENOENT; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old; + struct hlist_head *head; + unsigned long flags; + u32 key_size; + int ret; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* allocate new element outside of lock */ + l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + if (!l_new) + return -ENOMEM; + + key_size = map->key_size; + + memcpy(l_new->key, key, key_size); + memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + + l_new->hash = htab_map_hash(l_new->key, key_size); + + /* bpf_map_update_elem() can be called in_irq() */ + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, l_new->hash); + + l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + + if (!l_old && unlikely(htab->count >= map->max_entries)) { + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + ret = -E2BIG; + goto err; + } + + if (l_old && map_flags == BPF_NOEXIST) { + /* elem already exists */ + ret = -EEXIST; + goto err; + } + + if (!l_old && map_flags == BPF_EXIST) { + /* elem doesn't exist, cannot update it */ + ret = -ENOENT; + goto err; + } + + /* add new element to the head of the list, so that concurrent + * search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + hlist_del_rcu(&l_old->hash_node); + kfree_rcu(l_old, rcu); + } else { + htab->count++; + } + spin_unlock_irqrestore(&htab->lock, flags); + + return 0; +err: + spin_unlock_irqrestore(&htab->lock, flags); + kfree(l_new); + return ret; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree_rcu(l, rcu); + ret = 0; + } + + spin_unlock_irqrestore(&htab->lock, flags); + return ret; +} + +static void delete_all_elements(struct bpf_htab *htab) +{ + int i; + + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree(l); + } + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + /* some of kfree_rcu() callbacks for elements of this map may not have + * executed. It's ok. Proceed to free residual elements and map itself + */ + delete_all_elements(htab); + kvfree(htab->buckets); + kfree(htab); +} + +static struct bpf_map_ops htab_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_map_lookup_elem, + .map_update_elem = htab_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &htab_ops, + .type = BPF_MAP_TYPE_HASH, +}; + +static int __init register_htab_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c new file mode 100644 index 00000000000..9e3414d8545 --- /dev/null +++ b/kernel/bpf/helpers.c @@ -0,0 +1,89 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/rcupdate.h> + +/* If kernel subsystem is allowing eBPF programs to call this function, + * inside its own verifier_ops->get_func_proto() callback it should return + * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments + * + * Different map implementations will rely on rcu in map methods + * lookup/update/delete, therefore eBPF programs must run under rcu lock + * if program is allowed to access maps, so check rcu_read_lock_held in + * all three functions. + */ +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* verifier checked that R1 contains a valid pointer to bpf_map + * and R2 points to a program stack and map->key_size bytes were + * initialized + */ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + value = map->ops->map_lookup_elem(map, key); + + /* lookup() returns either pointer to element value or NULL + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type + */ + return (unsigned long) value; +} + +struct bpf_func_proto bpf_map_lookup_elem_proto = { + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value = (void *) (unsigned long) r3; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_update_elem(map, key, value, r4); +} + +struct bpf_func_proto bpf_map_update_elem_proto = { + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, +}; + +static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_delete_elem(map, key); +} + +struct bpf_func_proto bpf_map_delete_elem_proto = { + .func = bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ba61c8c1603..088ac0b1b10 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; - err = -ESRCH; + err = -ENOENT; rcu_read_lock(); value = map->ops->map_lookup_elem(map, key); if (!value) @@ -190,7 +190,7 @@ err_put: return err; } -#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr) { @@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr) * therefore all map accessors rely on this fact, so do the same here */ rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value); + err = map->ops->map_update_elem(map, key, value, attr->flags); rcu_read_unlock(); free_value: diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c index fcaddff4003..0ceae1e6e8b 100644 --- a/kernel/bpf/test_stub.c +++ b/kernel/bpf/test_stub.c @@ -18,26 +18,18 @@ struct bpf_context { u64 arg2; }; -static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - return 0; -} - -static struct bpf_func_proto test_funcs[] = { - [BPF_FUNC_unspec] = { - .func = test_func, - .gpl_only = true, - .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, - .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_KEY, - }, -}; - static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) { - if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs)) + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: return NULL; - return &test_funcs[func_id]; + } } static const struct bpf_context_access { @@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = { .type = BPF_PROG_TYPE_UNSPEC, }; -static struct bpf_map *test_map_alloc(union bpf_attr *attr) -{ - struct bpf_map *map; - - map = kzalloc(sizeof(*map), GFP_USER); - if (!map) - return ERR_PTR(-ENOMEM); - - map->key_size = attr->key_size; - map->value_size = attr->value_size; - map->max_entries = attr->max_entries; - return map; -} - -static void test_map_free(struct bpf_map *map) -{ - kfree(map); -} - -static struct bpf_map_ops test_map_ops = { - .map_alloc = test_map_alloc, - .map_free = test_map_free, -}; - -static struct bpf_map_type_list tl_map = { - .ops = &test_map_ops, - .type = BPF_MAP_TYPE_UNSPEC, -}; - static int __init register_test_ops(void) { - bpf_register_map_type(&tl_map); bpf_register_prog_type(&tl_prog); return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f81818f294..a28e09c7825 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,22 +153,19 @@ struct reg_state { enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ - STACK_SPILL, /* 1st byte of register spilled into stack */ - STACK_SPILL_PART, /* other 7 bytes of register spill */ + STACK_SPILL, /* register spilled into stack */ STACK_MISC /* BPF program wrote some data into this slot */ }; -struct bpf_stack_slot { - enum bpf_stack_slot_type stype; - struct reg_state reg_st; -}; +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ /* state of the program: * type of all registers and stack info */ struct verifier_state { struct reg_state regs[MAX_BPF_REG]; - struct bpf_stack_slot stack[MAX_BPF_STACK]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; }; /* linked list of verifier states used to prune search */ @@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env) env->cur_state.regs[i].map_ptr->key_size, env->cur_state.regs[i].map_ptr->value_size); } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (env->cur_state.stack[i].stype == STACK_SPILL) + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (env->cur_state.stack_slot_type[i] == STACK_SPILL) verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[env->cur_state.stack[i].reg_st.type]); + reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); } verbose("\n"); } @@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size) static int check_stack_write(struct verifier_state *state, int off, int size, int value_regno) { - struct bpf_stack_slot *slot; int i; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, + * so it's aligned access and [off, off + size) are within stack limits + */ if (value_regno >= 0 && (state->regs[value_regno].type == PTR_TO_MAP_VALUE || @@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size, state->regs[value_regno].type == PTR_TO_CTX)) { /* register containing pointer is being spilled into stack */ - if (size != 8) { + if (size != BPF_REG_SIZE) { verbose("invalid size of register spill\n"); return -EACCES; } - slot = &state->stack[MAX_BPF_STACK + off]; - slot->stype = STACK_SPILL; /* save register state */ - slot->reg_st = state->regs[value_regno]; - for (i = 1; i < 8; i++) { - slot = &state->stack[MAX_BPF_STACK + off + i]; - slot->stype = STACK_SPILL_PART; - slot->reg_st.type = UNKNOWN_VALUE; - slot->reg_st.map_ptr = NULL; - } - } else { + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + state->regs[value_regno]; + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + } else { /* regular write of data into stack */ - for (i = 0; i < size; i++) { - slot = &state->stack[MAX_BPF_STACK + off + i]; - slot->stype = STACK_MISC; - slot->reg_st.type = UNKNOWN_VALUE; - slot->reg_st.map_ptr = NULL; - } + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + (struct reg_state) {}; + + for (i = 0; i < size; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; } return 0; } @@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size, static int check_stack_read(struct verifier_state *state, int off, int size, int value_regno) { + u8 *slot_type; int i; - struct bpf_stack_slot *slot; - slot = &state->stack[MAX_BPF_STACK + off]; + slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; - if (slot->stype == STACK_SPILL) { - if (size != 8) { + if (slot_type[0] == STACK_SPILL) { + if (size != BPF_REG_SIZE) { verbose("invalid size of register spill\n"); return -EACCES; } - for (i = 1; i < 8; i++) { - if (state->stack[MAX_BPF_STACK + off + i].stype != - STACK_SPILL_PART) { + for (i = 1; i < BPF_REG_SIZE; i++) { + if (slot_type[i] != STACK_SPILL) { verbose("corrupted spill memory\n"); return -EACCES; } @@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size, if (value_regno >= 0) /* restore register state from stack */ - state->regs[value_regno] = slot->reg_st; + state->regs[value_regno] = + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; return 0; } else { for (i = 0; i < size; i++) { - if (state->stack[MAX_BPF_STACK + off + i].stype != - STACK_MISC) { + if (slot_type[i] != STACK_MISC) { verbose("invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env, } for (i = 0; i < access_size; i++) { - if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) { + if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +/* verify safety of LD_ABS|LD_IND instructions: + * - they can only appear in the programs where ctx == skb + * - since they are wrappers of function calls, they scratch R1-R5 registers, + * preserve R6-R9, and store return value into R0 + * + * Implicit input: + * ctx == skb == R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ +static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + u8 mode = BPF_MODE(insn->code); + struct reg_state *reg; + int i, err; + + if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + return -EINVAL; + } + + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose("BPF_LD_ABS uses reserved fields\n"); + return -EINVAL; + } + + /* check whether implicit source operand (register R6) is readable */ + err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + if (err) + return err; + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + return -EINVAL; + } + + if (mode == BPF_IND) { + /* check explicit source operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } + + /* reset caller saved regs to unreadable */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* mark destination R0 register as readable, since it contains + * the value fetched from the packet + */ + regs[BPF_REG_0].type = UNKNOWN_VALUE; + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur) } for (i = 0; i < MAX_BPF_STACK; i++) { - if (memcmp(&old->stack[i], &cur->stack[i], - sizeof(old->stack[0])) != 0) { - if (old->stack[i].stype == STACK_INVALID) - continue; + if (old->stack_slot_type[i] == STACK_INVALID) + continue; + if (old->stack_slot_type[i] != cur->stack_slot_type[i]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ return false; - } + if (i % BPF_REG_SIZE) + continue; + if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], + &cur->spilled_regs[i / BPF_REG_SIZE], + sizeof(old->spilled_regs[0]))) + /* when explored and current stack slot types are + * the same, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * but current path has stored: + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + else + continue; } return true; } @@ -1664,8 +1741,10 @@ process_bpf_exit: u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { - verbose("LD_ABS is not supported yet\n"); - return -EINVAL; + err = check_ld_abs(env, insn); + if (err) + return err; + } else if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 136eceadeed..bb263d0caab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!(cgrp->root->subsys_mask & (1 << ss->id))) return NULL; + /* + * This function is used while updating css associations and thus + * can't test the csses directly. Use ->child_subsys_mask. + */ while (cgroup_parent(cgrp) && !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) cgrp = cgroup_parent(cgrp); @@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, return cgroup_css(cgrp, ss); } +/** + * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest + * + * Find and get the effective css of @cgrp for @ss. The effective css is + * defined as the matching css of the nearest ancestor including self which + * has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on, + * the root css is returned, so this function always returns a valid css. + * The returned css must be put using css_put(). + */ +struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + do { + css = cgroup_css(cgrp, ss); + + if (css && css_tryget_online(css)) + goto out_unlock; + cgrp = cgroup_parent(cgrp); + } while (cgrp); + + css = init_css_set.subsys[ss->id]; + css_get(css); +out_unlock: + rcu_read_unlock(); + return css; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp) } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * cgroup_calc_child_subsys_mask - calculate child_subsys_mask * @cgrp: the target cgroup + * @subtree_control: the new subtree_control mask to consider * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * - * This function determines which subsystems need to be enabled given the - * current @cgrp->subtree_control and records it in - * @cgrp->child_subsys_mask. The resulting mask is always a superset of - * @cgrp->subtree_control and follows the usual hierarchy rules. + * This function calculates which subsystems need to be enabled if + * @subtree_control is to be applied to @cgrp. The returned mask is always + * a superset of @subtree_control and follows the usual hierarchy rules. */ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp, + unsigned int subtree_control) { struct cgroup *parent = cgroup_parent(cgrp); - unsigned int cur_ss_mask = cgrp->subtree_control; + unsigned int cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) { - cgrp->child_subsys_mask = cur_ss_mask; - return; - } + if (!cgroup_on_dfl(cgrp)) + return cur_ss_mask; while (true) { unsigned int new_ss_mask = cur_ss_mask; @@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) cur_ss_mask = new_ss_mask; } - cgrp->child_subsys_mask = cur_ss_mask; + return cur_ss_mask; +} + +/** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * Update @cgrp->child_subsys_mask according to the current + * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ + cgrp->child_subsys_mask = + cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); } /** @@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, loff_t off) { unsigned int enable = 0, disable = 0; - unsigned int css_enable, css_disable, old_ctrl, new_ctrl; + unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, ret = -ENOENT; goto out_unlock; } - - /* - * @ss is already enabled through dependency and - * we'll just make it visible. Skip draining. - */ - if (cgrp->child_subsys_mask & (1 << ssid)) - continue; - - /* - * Because css offlining is asynchronous, userland - * might try to re-enable the same controller while - * the previous instance is still around. In such - * cases, wait till it's gone using offline_waitq. - */ - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } } else if (disable & (1 << ssid)) { if (!(cgrp->subtree_control & (1 << ssid))) { disable &= ~(1 << ssid); @@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, * subsystems than specified may need to be enabled or disabled * depending on subsystem dependencies. */ - cgrp->subtree_control |= enable; - cgrp->subtree_control &= ~disable; + old_sc = cgrp->subtree_control; + old_ss = cgrp->child_subsys_mask; + new_sc = (old_sc | enable) & ~disable; + new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); - old_ctrl = cgrp->child_subsys_mask; - cgroup_refresh_child_subsys_mask(cgrp); - new_ctrl = cgrp->child_subsys_mask; - - css_enable = ~old_ctrl & new_ctrl; - css_disable = old_ctrl & ~new_ctrl; + css_enable = ~old_ss & new_ss; + css_disable = old_ss & ~new_ss; enable |= css_enable; disable |= css_disable; /* + * Because css offlining is asynchronous, userland might try to + * re-enable the same controller while the previous instance is + * still around. In such cases, wait till it's gone using + * offline_waitq. + */ + for_each_subsys(ss, ssid) { + if (!(css_enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + DEFINE_WAIT(wait); + + if (!cgroup_css(child, ss)) + continue; + + cgroup_get(child); + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + cgroup_kn_unlock(of->kn); + schedule(); + finish_wait(&child->offline_waitq, &wait); + cgroup_put(child); + + return restart_syscall(); + } + } + + cgrp->subtree_control = new_sc; + cgrp->child_subsys_mask = new_ss; + + /* * Create new csses or make the existing ones visible. A css is * created invisible if it's being implicitly enabled through * dependency. An invisible css is made visible when the userland @@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, } } + /* + * The effective csses of all the descendants (excluding @cgrp) may + * have changed. Subsystems can optionally subscribe to this event + * by implementing ->css_e_css_changed() which is invoked if any of + * the effective csses seen from the css's cgroup may have changed. + */ + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); + struct cgroup_subsys_state *css; + + if (!ss->css_e_css_changed || !this_css) + continue; + + css_for_each_descendant_pre(css, this_css) + if (css != this_css) + ss->css_e_css_changed(css); + } + kernfs_activate(cgrp->kn); ret = 0; out_unlock: @@ -2832,9 +2898,8 @@ out_unlock: return ret ?: nbytes; err_undo_css: - cgrp->subtree_control &= ~enable; - cgrp->subtree_control |= disable; - cgroup_refresh_child_subsys_mask(cgrp); + cgrp->subtree_control = old_sc; + cgrp->child_subsys_mask = old_ss; for_each_subsys(ss, ssid) { if (!(enable & (1 << ssid))) @@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ cgroup_idr_remove(&ss->css_idr, css->id); + if (ss->css_released) + ss->css_released(css); } else { /* cgroup release path */ cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 723cfc9d0ad..64b257f6bca 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -248,34 +248,34 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global mutexes guarding cpuset structures - cpuset_mutex - * and callback_mutex. The latter may nest inside the former. We also - * require taking task_lock() when dereferencing a task's cpuset pointer. - * See "The task_lock() exception", at the end of this comment. + * There are two global locks guarding cpuset structures - cpuset_mutex and + * callback_lock. We also require taking task_lock() when dereferencing a + * task's cpuset pointer. See "The task_lock() exception", at the end of this + * comment. * - * A task must hold both mutexes to modify cpusets. If a task holds + * A task must hold both locks to modify cpusets. If a task holds * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it - * is the only task able to also acquire callback_mutex and be able to + * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while * just holding cpuset_mutex. While it is performing these checks, various - * callback routines can briefly acquire callback_mutex to query cpusets. - * Once it is ready to make the changes, it takes callback_mutex, blocking + * callback routines can briefly acquire callback_lock to query cpusets. + * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. * * Calls to the kernel memory allocator can not be made while holding - * callback_mutex, as that would risk double tripping on callback_mutex + * callback_lock, as that would risk double tripping on callback_lock * from one of the callbacks into the cpuset code from within * __alloc_pages(). * - * If a task is only holding callback_mutex, then it has read-only + * If a task is only holding callback_lock, then it has read-only * access to cpusets. * * Now, the task_struct fields mems_allowed and mempolicy may be changed * by other task, we use alloc_lock in the task_struct fields to protect * them. * - * The cpuset_common_file_read() handlers only hold callback_mutex across + * The cpuset_common_file_read() handlers only hold callback_lock across * small pieces of code, such as when reading out possibly multi-word * cpumasks and nodemasks. * @@ -284,7 +284,7 @@ static struct cpuset top_cpuset = { */ static DEFINE_MUTEX(cpuset_mutex); -static DEFINE_MUTEX(callback_mutex); +static DEFINE_SPINLOCK(callback_lock); /* * CPU / memory hotplug is handled asynchronously. @@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = { * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { @@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_mutex held. + * Call with callback_lock or cpuset_mutex held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Called with callback_mutex/cpuset_mutex held + * Call with callback_lock or cpuset_mutex held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) continue; rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, new_cpus); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* use trialcs->cpus_allowed as a temp variable */ update_cpumasks_hier(cs, trialcs->cpus_allowed); @@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) continue; rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && !nodes_equal(cp->mems_allowed, cp->effective_mems)); @@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_mutex during call. + * Call with cpuset_mutex held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_sem, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &cs->mems_allowed); @@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) rebuild_sched_domains_locked(); @@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) count = seq_get_buf(sf, &buf); s = buf; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: @@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) seq_commit(sf, -1); } out_unlock: - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); return ret; } @@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (cgroup_on_dfl(cs->css.cgroup)) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) } rcu_read_unlock(); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); return 0; @@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { mutex_lock(&cpuset_mutex); - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (cgroup_on_dfl(root_css->cgroup)) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); @@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) top_cpuset.mems_allowed = top_cpuset.effective_mems; } - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); mutex_unlock(&cpuset_mutex); } @@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, { bool is_empty; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, new_cpus); cpumask_copy(cs->effective_cpus, new_cpus); cs->mems_allowed = *new_mems; cs->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, @@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs, if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); if (cpus_updated) update_tasks_cpumask(cs); @@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); cpumask_copy(top_cpuset.effective_cpus, &new_cpus); - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { - mutex_lock(&callback_mutex); + spin_lock_irq(&callback_lock); if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; - mutex_unlock(&callback_mutex); + spin_unlock_irq(&callback_lock); update_tasks_nodemask(&top_cpuset); } @@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void) void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { - mutex_lock(&callback_mutex); + unsigned long flags; + + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); } void cpuset_cpus_allowed_fallback(struct task_struct *tsk) @@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void) nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { nodemask_t mask; + unsigned long flags; - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return mask; } @@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) /* * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or * mem_hardwall ancestor to the specified cpuset. Call holding - * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall + * callback_lock. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) @@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) } /** - * cpuset_node_allowed_softwall - Can we allocate on a memory node? + * cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * flag, yes. * Otherwise, no. * - * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to - * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall() - * might sleep, and might allow a node from an enclosing cpuset. - * - * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall - * cpusets, and never sleeps. - * * The __GFP_THISNODE placement logic is really handled elsewhere, * by forcibly using a zonelist starting at a specified node, and by * (in get_page_from_freelist()) refusing to consider the zones for @@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * - * Scanning up parent cpusets requires callback_mutex. The + * Scanning up parent cpusets requires callback_lock. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the * current tasks mems_allowed came up empty on the first pass over * the zonelist. So only GFP_KERNEL allocations, if all nodes in the - * cpuset are short of memory, might require taking the callback_mutex - * mutex. + * cpuset are short of memory, might require taking the callback_lock. * * The first call here from mm/page_alloc:get_page_from_freelist() * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, @@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * TIF_MEMDIE - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. - * - * Rule: - * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you - * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables - * the code that might scan up ancestor cpusets and sleep. */ -int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) +int __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ + unsigned long flags; if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) return 1; - might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; /* @@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) return 1; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - mutex_lock(&callback_mutex); + spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); rcu_read_unlock(); - mutex_unlock(&callback_mutex); + spin_unlock_irqrestore(&callback_lock, flags); return allowed; } -/* - * cpuset_node_allowed_hardwall - Can we allocate on a memory node? - * @node: is this an allowed node? - * @gfp_mask: memory allocation flags - * - * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is - * set, yes, we can always allocate. If node is in our task's mems_allowed, - * yes. If the task has been OOM killed and has access to memory reserves as - * specified by the TIF_MEMDIE flag, yes. - * Otherwise, no. - * - * The __GFP_THISNODE placement logic is really handled elsewhere, - * by forcibly using a zonelist starting at a specified node, and by - * (in get_page_from_freelist()) refusing to consider the zones for - * any node on the zonelist except the first. By the time any such - * calls get to this routine, we should just shut up and say 'yes'. - * - * Unlike the cpuset_node_allowed_softwall() variant, above, - * this variant requires that the node be in the current task's - * mems_allowed or that we're in interrupt. It does not scan up the - * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. - * It never sleeps. - */ -int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) -{ - if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) - return 1; - if (node_isset(node, current->mems_allowed)) - return 1; - /* - * Allow tasks that have access to memory reserves because they have - * been OOM killed to get memory anywhere. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; - return 0; -} - /** * cpuset_mem_spread_node() - On which node to begin search for a file page * cpuset_slab_spread_node() - On which node to begin search for a slab page diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e19d3ebc29..4c1ee7f2beb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -614,7 +614,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - css = css_tryget_online_from_dir(f.file->f_dentry, + css = css_tryget_online_from_dir(f.file->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); @@ -7477,11 +7477,11 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { synchronize_rcu(); - perf_install_in_context(ctx, group_leader, event->cpu); + perf_install_in_context(ctx, group_leader, group_leader->cpu); get_ctx(ctx); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_install_in_context(ctx, sibling, event->cpu); + perf_install_in_context(ctx, sibling, sibling->cpu); get_ctx(ctx); } } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ed8f2cde34c..cb346f26a22 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); + ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); @@ -724,14 +724,14 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) int more = 0; again: - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (!valid_vma(vma, is_register)) continue; if (!prev && !more) { /* - * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through + * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails. */ prev = kmalloc(sizeof(struct map_info), @@ -755,7 +755,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) info->mm = vma->vm_mm; info->vaddr = offset_to_vaddr(vma, offset); } - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_read(mapping); if (!more) goto out; diff --git a/kernel/exit.c b/kernel/exit.c index 232c4bc8bcc..1ea4369890a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,13 +118,10 @@ static void __exit_signal(struct task_struct *tsk) } /* - * Accumulate here the counters for all threads but the group leader - * as they die, so they can be added into the process-wide totals - * when those are taken. The group leader stays around as a zombie as - * long as there are other threads. When it gets reaped, the exit.c - * code will add its counts into these totals. We won't ever get here - * for the group leader, since it will have been the last reference on - * the signal_struct. + * Accumulate here the counters for all threads as they die. We could + * skip the group leader because it is the last user of signal_struct, + * but we want to avoid the race with thread_group_cputime() which can + * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); @@ -215,27 +212,6 @@ repeat: } /* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are @@ -462,6 +438,44 @@ static void exit_mm(struct task_struct *tsk) clear_thread_flag(TIF_MEMDIE); } +static struct task_struct *find_alive_thread(struct task_struct *p) +{ + struct task_struct *t; + + for_each_thread(p, t) { + if (!(t->flags & PF_EXITING)) + return t; + } + return NULL; +} + +static struct task_struct *find_child_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *reaper = pid_ns->child_reaper; + + if (likely(reaper != father)) + return reaper; + + reaper = find_alive_thread(father); + if (reaper) { + pid_ns->child_reaper = reaper; + return reaper; + } + + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: father->exit_code); + } + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + + return father; +} + /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists @@ -469,58 +483,36 @@ static void exit_mm(struct task_struct *tsk) * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +static struct task_struct *find_new_reaper(struct task_struct *father, + struct task_struct *child_reaper) { - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; + struct task_struct *thread, *reaper; - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; + thread = find_alive_thread(father); + if (thread) return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: - father->exit_code); - } - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - } else if (father->signal->has_child_subreaper) { - struct task_struct *reaper; + if (father->signal->has_child_subreaper) { /* - * Find the first ancestor marked as child_subreaper. - * Note that the code below checks same_thread_group(reaper, - * pid_ns->child_reaper). This is what we need to DTRT in a - * PID namespace. However we still need the check above, see - * http://marc.info/?l=linux-kernel&m=131385460420380 + * Find the first ->is_child_subreaper ancestor in our pid_ns. + * We start from father to ensure we can not look into another + * namespace, this is safe because all its threads are dead. */ - for (reaper = father->real_parent; - reaper != &init_task; + for (reaper = father; + !same_thread_group(reaper, child_reaper); reaper = reaper->real_parent) { - if (same_thread_group(reaper, pid_ns->child_reaper)) + /* call_usermodehelper() descendants need this check */ + if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; - thread = reaper; - do { - if (!(thread->flags & PF_EXITING)) - return reaper; - } while_each_thread(reaper, thread); + thread = find_alive_thread(reaper); + if (thread) + return thread; } } - return pid_ns->child_reaper; + return child_reaper; } /* @@ -529,15 +521,7 @@ static struct task_struct *find_new_reaper(struct task_struct *father) static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) + if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ @@ -548,49 +532,53 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); + list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } -static void forget_original_parent(struct task_struct *father) +/* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void forget_original_parent(struct task_struct *father, + struct list_head *dead) { - struct task_struct *p, *n, *reaper; - LIST_HEAD(dead_children); + struct task_struct *p, *t, *reaper; - write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, dead); - list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; + /* Can drop and reacquire tasklist_lock */ + reaper = find_child_reaper(father); + if (list_empty(&father->children)) + return; - do { + reaper = find_new_reaper(father, reaper); + list_for_each_entry(p, &father->children, sibling) { + for_each_thread(p, t) { t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); + BUG_ON((!t->ptrace) != (t->parent == father)); + if (likely(!t->ptrace)) t->parent = t->real_parent; - } if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); - } while_each_thread(p, t); - reparent_leader(father, p, &dead_children); - } - write_unlock_irq(&tasklist_lock); - - BUG_ON(!list_empty(&father->children)); - - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); - release_task(p); + } + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (!same_thread_group(reaper, father)) + reparent_leader(father, p, dead); } + list_splice_tail_init(&father->children, &reaper->children); } /* @@ -600,18 +588,12 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); + struct task_struct *p, *n; + LIST_HEAD(dead); write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); + if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -629,15 +611,18 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; + if (tsk->exit_state == EXIT_DEAD) + list_add(&tsk->ptrace_entry, &dead); /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } #ifdef CONFIG_DEBUG_STACK_USAGE @@ -982,8 +967,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - unsigned long state; - int retval, status, traced; + int state, retval, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; @@ -1008,21 +992,25 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) } return wait_noreap_copyout(wo, p, pid, uid, why, status); } - - traced = ptrace_reparented(p); /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; + state = (ptrace_reparented(p) && thread_group_leader(p)) ? + EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. + * We own this thread, nobody else can reap it. */ - if (likely(!traced) && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; + read_unlock(&tasklist_lock); + sched_annotate_sleep(); + + /* + * Check thread_group_leader() to exclude the traced sub-threads. + */ + if (state == EXIT_DEAD && thread_group_leader(p)) { + struct signal_struct *sig = p->signal; + struct signal_struct *psig = current->signal; unsigned long maxrss; cputime_t tgutime, tgstime; @@ -1034,21 +1022,20 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. + * p->signal fields because the whole thread group is dead + * and nobody can change them. + * + * psig->stats_lock also protects us from our sub-theads + * which can reap other children at the same time. Until + * we change k_getrusage()-like users to rely on this lock + * we have to take ->siglock as well. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; + spin_lock_irq(¤t->sighand->siglock); write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; @@ -1073,16 +1060,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock(&psig->stats_lock); - spin_unlock_irq(&p->real_parent->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); } - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we its state == DEAD/TRACE. - */ - read_unlock(&tasklist_lock); - sched_annotate_sleep(); - retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) diff --git a/kernel/extable.c b/kernel/extable.c index d8a6446adbc..c98f926277a 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -18,6 +18,7 @@ #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/module.h> +#include <linux/ftrace.h> #include <linux/mutex.h> #include <linux/init.h> @@ -102,6 +103,8 @@ int __kernel_text_address(unsigned long addr) return 1; if (is_module_text_address(addr)) return 1; + if (is_ftrace_trampoline(addr)) + return 1; /* * There might be init symbols in saved stacktraces. * Give those symbols a chance to be printed in @@ -119,7 +122,9 @@ int kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return is_module_text_address(addr); + if (is_module_text_address(addr)) + return 1; + return is_ftrace_trampoline(addr); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 9ca84189cfc..4dc2ddade9f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -433,7 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); @@ -445,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); } /* diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3b7408759bd..c92e44855dd 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -32,10 +32,13 @@ config GCOV_KERNEL Note that the debugfs filesystem has to be mounted to access profiling data. +config ARCH_HAS_GCOV_PROFILE_ALL + def_bool n + config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64 + depends on ARCH_HAS_GCOV_PROFILE_ALL default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/groups.c b/kernel/groups.c index 451698f86cf..664411f171b 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -6,6 +6,7 @@ #include <linux/slab.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/user_namespace.h> #include <asm/uaccess.h> /* init to 2 - one for init_task, one to ensure it is never freed */ @@ -213,6 +214,14 @@ out: return i; } +bool may_setgroups(void) +{ + struct user_namespace *user_ns = current_user_ns(); + + return ns_capable(user_ns, CAP_SETGID) && + userns_may_setgroups(user_ns); +} + /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. @@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 3ab9048483f..cbf9fb899d9 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { - struct llist_head *raised = &__get_cpu_var(raised_list); + struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); - irq_work_run_list(&__get_cpu_var(lazy_list)); + irq_work_run_list(this_cpu_ptr(&lazy_list)); } /* diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a6..9a8a01abbae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -600,7 +600,7 @@ kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, if (!kexec_on_panic) { image->swap_page = kimage_alloc_control_pages(image, 0); if (!image->swap_page) { - pr_err(KERN_ERR "Could not allocate swap buffer\n"); + pr_err("Could not allocate swap buffer\n"); goto out_free_control_pages; } } diff --git a/kernel/kmod.c b/kernel/kmod.c index 80f7a6d0051..2777f40a9c7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -47,13 +47,6 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; -/* - * kmod_thread_locker is used for deadlock avoidance. There is no explicit - * locking to protect this global - it is private to the singleton khelper - * thread and should only ever be modified by that thread. - */ -static const struct task_struct *kmod_thread_locker; - #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -223,7 +216,6 @@ static void umh_complete(struct subprocess_info *sub_info) static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; - int wait = sub_info->wait & ~UMH_KILLABLE; struct cred *new; int retval; @@ -267,20 +259,13 @@ static int ____call_usermodehelper(void *data) out: sub_info->retval = retval; /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ - if (wait != UMH_WAIT_PROC) + if (!(sub_info->wait & UMH_WAIT_PROC)) umh_complete(sub_info); if (!retval) return 0; do_exit(0); } -static int call_helper(void *data) -{ - /* Worker thread started blocking khelper thread. */ - kmod_thread_locker = current; - return ____call_usermodehelper(data); -} - /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -323,21 +308,14 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; - /* CLONE_VFORK: wait until the usermode helper has execve'd - * successfully We need the data structures to stay around - * until that is done. */ - if (wait == UMH_WAIT_PROC) + if (sub_info->wait & UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); - else { - pid = kernel_thread(call_helper, sub_info, - CLONE_VFORK | SIGCHLD); - /* Worker thread stopped blocking khelper thread. */ - kmod_thread_locker = NULL; - } + else + pid = kernel_thread(____call_usermodehelper, sub_info, + SIGCHLD); if (pid < 0) { sub_info->retval = pid; @@ -571,17 +549,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) goto out; } /* - * Worker thread must not wait for khelper thread at below - * wait_for_completion() if the thread was created with CLONE_VFORK - * flag, for khelper thread is already waiting for the thread at - * wait_for_completion() in do_fork(). - */ - if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { - retval = -EBUSY; - goto out; - } - - /* * Set the completion pointer only if there is a waiter. * This makes it possible to use umh_complete to free * the data structure in case of UMH_NO_WAIT. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3995f546d0f..06f58309fed 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -915,7 +915,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) #ifdef CONFIG_KPROBES_ON_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { .func = kprobe_ftrace_handler, - .flags = FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY, }; static int kprobe_ftrace_enabled; @@ -1410,16 +1410,10 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -static int check_kprobe_address_safe(struct kprobe *p, - struct module **probed_mod) +int __weak arch_check_ftrace_location(struct kprobe *p) { - int ret = 0; unsigned long ftrace_addr; - /* - * If the address is located on a ftrace nop, set the - * breakpoint to the following instruction. - */ ftrace_addr = ftrace_location((unsigned long)p->addr); if (ftrace_addr) { #ifdef CONFIG_KPROBES_ON_FTRACE @@ -1431,7 +1425,17 @@ static int check_kprobe_address_safe(struct kprobe *p, return -EINVAL; #endif } + return 0; +} +static int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) +{ + int ret; + + ret = arch_check_ftrace_location(p); + if (ret) + return ret; jump_label_lock(); preempt_disable(); diff --git a/kernel/module.c b/kernel/module.c index e52a8739361..3965511ae13 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -42,7 +42,6 @@ #include <linux/vermagic.h> #include <linux/notifier.h> #include <linux/sched.h> -#include <linux/stop_machine.h> #include <linux/device.h> #include <linux/string.h> #include <linux/mutex.h> @@ -98,7 +97,7 @@ * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, * 3) module_addr_min/module_addr_max. - * (delete uses stop_machine/add uses RCU list operations). */ + * (delete and add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); @@ -158,13 +157,13 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; -int register_module_notifier(struct notifier_block * nb) +int register_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&module_notify_list, nb); } EXPORT_SYMBOL(register_module_notifier); -int unregister_module_notifier(struct notifier_block * nb) +int unregister_module_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&module_notify_list, nb); } @@ -628,18 +627,23 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); +/* MODULE_REF_BASE is the base reference count by kmodule loader. */ +#define MODULE_REF_BASE 1 + /* Init the unload section of the module. */ static int module_unload_init(struct module *mod) { - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) - return -ENOMEM; + /* + * Initialize reference counter to MODULE_REF_BASE. + * refcnt == 0 means module is going. + */ + atomic_set(&mod->refcnt, MODULE_REF_BASE); INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - raw_cpu_write(mod->refptr->incs, 1); + atomic_inc(&mod->refcnt); return 0; } @@ -721,8 +725,6 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); - - free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -740,60 +742,39 @@ static inline int try_force_unload(unsigned int flags) } #endif /* CONFIG_MODULE_FORCE_UNLOAD */ -struct stopref +/* Try to release refcount of module, 0 means success. */ +static int try_release_module_ref(struct module *mod) { - struct module *mod; - int flags; - int *forced; -}; + int ret; -/* Whole machine is stopped with interrupts off when this runs. */ -static int __try_stop_module(void *_sref) -{ - struct stopref *sref = _sref; + /* Try to decrement refcnt which we set at loading */ + ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt); + BUG_ON(ret < 0); + if (ret) + /* Someone can put this right now, recover with checking */ + ret = atomic_add_unless(&mod->refcnt, MODULE_REF_BASE, 0); + + return ret; +} +static int try_stop_module(struct module *mod, int flags, int *forced) +{ /* If it's not unused, quit unless we're forcing. */ - if (module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force_unload(sref->flags))) + if (try_release_module_ref(mod) != 0) { + *forced = try_force_unload(flags); + if (!(*forced)) return -EWOULDBLOCK; } /* Mark it as dying. */ - sref->mod->state = MODULE_STATE_GOING; - return 0; -} - -static int try_stop_module(struct module *mod, int flags, int *forced) -{ - struct stopref sref = { mod, flags, forced }; + mod->state = MODULE_STATE_GOING; - return stop_machine(__try_stop_module, &sref, NULL); + return 0; } unsigned long module_refcount(struct module *mod) { - unsigned long incs = 0, decs = 0; - int cpu; - - for_each_possible_cpu(cpu) - decs += per_cpu_ptr(mod->refptr, cpu)->decs; - /* - * ensure the incs are added up after the decs. - * module_put ensures incs are visible before decs with smp_wmb. - * - * This 2-count scheme avoids the situation where the refcount - * for CPU0 is read, then CPU0 increments the module refcount, - * then CPU1 drops that refcount, then the refcount for CPU1 is - * read. We would record a decrement but not its corresponding - * increment so we would see a low count (disaster). - * - * Rare situation? But module_refcount can be preempted, and we - * might be tallying up 4096+ CPUs. So it is not impossible. - */ - smp_rmb(); - for_each_possible_cpu(cpu) - incs += per_cpu_ptr(mod->refptr, cpu)->incs; - return incs - decs; + return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -877,8 +858,10 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) seq_printf(m, " %lu ", module_refcount(mod)); - /* Always include a trailing , so userspace can differentiate - between this and the old multi-field proc format. */ + /* + * Always include a trailing , so userspace can differentiate + * between this and the old multi-field proc format. + */ list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; seq_printf(m, "%s,", use->source->name); @@ -886,11 +869,11 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) if (mod->init != NULL && mod->exit == NULL) { printed_something = 1; - seq_printf(m, "[permanent],"); + seq_puts(m, "[permanent],"); } if (!printed_something) - seq_printf(m, "-"); + seq_puts(m, "-"); } void __symbol_put(const char *symbol) @@ -935,7 +918,7 @@ void __module_get(struct module *module) { if (module) { preempt_disable(); - __this_cpu_inc(module->refptr->incs); + atomic_inc(&module->refcnt); trace_module_get(module, _RET_IP_); preempt_enable(); } @@ -948,11 +931,11 @@ bool try_module_get(struct module *module) if (module) { preempt_disable(); - - if (likely(module_is_live(module))) { - __this_cpu_inc(module->refptr->incs); + /* Note: here, we can fail to get a reference */ + if (likely(module_is_live(module) && + atomic_inc_not_zero(&module->refcnt) != 0)) trace_module_get(module, _RET_IP_); - } else + else ret = false; preempt_enable(); @@ -963,11 +946,12 @@ EXPORT_SYMBOL(try_module_get); void module_put(struct module *module) { + int ret; + if (module) { preempt_disable(); - smp_wmb(); /* see comment in module_refcount */ - __this_cpu_inc(module->refptr->decs); - + ret = atomic_dec_if_positive(&module->refcnt); + WARN_ON(ret < 0); /* Failed to put refcount */ trace_module_put(module, _RET_IP_); preempt_enable(); } @@ -978,7 +962,7 @@ EXPORT_SYMBOL(module_put); static inline void print_unload_info(struct seq_file *m, struct module *mod) { /* We don't know the usage count, or what modules are using. */ - seq_printf(m, " - -"); + seq_puts(m, " - -"); } static inline void module_unload_free(struct module *mod) @@ -1131,7 +1115,7 @@ static unsigned long maybe_relocated(unsigned long crc, static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1165,7 +1149,7 @@ static int check_version(Elf_Shdr *sechdrs, return 0; bad_version: - printk("%s: disagrees about version of symbol %s\n", + pr_warn("%s: disagrees about version of symbol %s\n", mod->name, symname); return 0; } @@ -1200,7 +1184,7 @@ static inline int same_magic(const char *amagic, const char *bmagic, static inline int check_version(Elf_Shdr *sechdrs, unsigned int versindex, const char *symname, - struct module *mod, + struct module *mod, const unsigned long *crc, const struct module *crc_owner) { @@ -1288,15 +1272,13 @@ static inline bool sect_empty(const Elf_Shdr *sect) return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; } -struct module_sect_attr -{ +struct module_sect_attr { struct module_attribute mattr; char *name; unsigned long address; }; -struct module_sect_attrs -{ +struct module_sect_attrs { struct attribute_group grp; unsigned int nsections; struct module_sect_attr attrs[0]; @@ -1550,7 +1532,8 @@ static int module_add_modinfo_attrs(struct module *mod) (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); sysfs_attr_init(&temp_attr->attr); - error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); + error = sysfs_create_file(&mod->mkobj.kobj, + &temp_attr->attr); ++temp_attr; } } @@ -1566,7 +1549,7 @@ static void module_remove_modinfo_attrs(struct module *mod) /* pick a field to test for end of list */ if (!attr->attr.name) break; - sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); + sysfs_remove_file(&mod->mkobj.kobj, &attr->attr); if (attr->free) attr->free(mod); } @@ -1697,18 +1680,6 @@ static void mod_sysfs_teardown(struct module *mod) mod_sysfs_fini(mod); } -/* - * unlink the module with the whole machine is stopped with interrupts off - * - this defends against kallsyms not taking locks - */ -static int __unlink_module(void *_mod) -{ - struct module *mod = _mod; - list_del(&mod->list); - module_bug_cleanup(mod); - return 0; -} - #ifdef CONFIG_DEBUG_SET_MODULE_RONX /* * LKM RO/NX protection: protect module's text/ro-data @@ -1860,7 +1831,12 @@ static void free_module(struct module *mod) /* Now we can delete it from the lists */ mutex_lock(&module_mutex); - stop_machine(__unlink_module, mod, NULL); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + /* Remove this module from bug list, this uses list_del_rcu */ + module_bug_cleanup(mod); + /* Wait for RCU synchronizing before releasing mod->list and buglist. */ + synchronize_rcu(); mutex_unlock(&module_mutex); /* This may be NULL, but that's OK */ @@ -1955,7 +1931,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) /* We compiled with -fno-common. These are not supposed to happen. */ pr_debug("Common symbol: %s\n", name); - printk("%s: please compile with -fno-common\n", + pr_warn("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; break; @@ -2259,7 +2235,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum) { const Elf_Shdr *sec; @@ -2735,7 +2711,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) * This shouldn't happen with same compiler and binutils * building all parts of the module. */ - printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + pr_warn("%s: has both .ctors and .init_array.\n", mod->name); return -EINVAL; } @@ -3023,8 +2999,10 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* Init routine failed: abort. Try to protect us from - buggy refcounters. */ + /* + * Init routine failed: abort. Try to protect us from + * buggy refcounters. + */ mod->state = MODULE_STATE_GOING; synchronize_sched(); module_put(mod); @@ -3202,7 +3180,7 @@ out: static int unknown_module_param_cb(char *param, char *val, const char *modname) { - /* Check for magic 'dyndbg' arg */ + /* Check for magic 'dyndbg' arg */ int ret = ddebug_dyndbg_module_param_cb(param, val, modname); if (ret != 0) pr_warn("%s: unknown parameter '%s' ignored\n", modname, param); @@ -3352,6 +3330,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); wake_up_all(&module_wq); + /* Wait for RCU synchronizing before releasing mod->list. */ + synchronize_rcu(); mutex_unlock(&module_mutex); free_module: module_deallocate(mod, info); @@ -3685,8 +3665,8 @@ static int m_show(struct seq_file *m, void *p) /* Informative for users. */ seq_printf(m, " %s", - mod->state == MODULE_STATE_GOING ? "Unloading": - mod->state == MODULE_STATE_COMING ? "Loading": + mod->state == MODULE_STATE_GOING ? "Unloading" : + mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ seq_printf(m, " 0x%pK", mod->module_core); @@ -3695,7 +3675,7 @@ static int m_show(struct seq_file *m, void *p) if (mod->taints) seq_printf(m, " %s", module_flags(mod, buf)); - seq_printf(m, "\n"); + seq_puts(m, "\n"); return 0; } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ef42d0ab311..49746c81ad8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -220,11 +220,10 @@ void exit_task_namespaces(struct task_struct *p) SYSCALL_DEFINE2(setns, int, fd, int, nstype) { - const struct proc_ns_operations *ops; struct task_struct *tsk = current; struct nsproxy *new_nsproxy; - struct proc_ns *ei; struct file *file; + struct ns_common *ns; int err; file = proc_ns_fget(fd); @@ -232,9 +231,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) return PTR_ERR(file); err = -EINVAL; - ei = get_proc_ns(file_inode(file)); - ops = ei->ns_ops; - if (nstype && (ops->type != nstype)) + ns = get_proc_ns(file_inode(file)); + if (nstype && (ns->ops->type != nstype)) goto out; new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); @@ -243,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) goto out; } - err = ops->install(new_nsproxy, ei->ns); + err = ns->ops->install(new_nsproxy, ns); if (err) { free_nsproxy(new_nsproxy); goto out; diff --git a/kernel/panic.c b/kernel/panic.c index cf80672b792..4d8d6f906de 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,6 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); static bool crash_kexec_post_notifiers; +int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, if (args) vprintk(args->fmt, args->args); + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + panic("panic_on_warn set ...\n"); + } + print_modules(); dump_stack(); print_oops_end_marker(); @@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +core_param(panic_on_warn, panic_on_warn, int, 0644); static int __init setup_crash_kexec_post_notifiers(char *s) { diff --git a/kernel/params.c b/kernel/params.c index db97b791390..0af9b2c4e56 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -603,74 +603,67 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, const struct kernel_param *kp, const char *name) { - struct module_param_attrs *new; - struct attribute **attrs; - int err, num; + struct module_param_attrs *new_mp; + struct attribute **new_attrs; + unsigned int i; /* We don't bother calling this with invisible parameters. */ BUG_ON(!kp->perm); if (!mk->mp) { - num = 0; - attrs = NULL; - } else { - num = mk->mp->num; - attrs = mk->mp->grp.attrs; + /* First allocation. */ + mk->mp = kzalloc(sizeof(*mk->mp), GFP_KERNEL); + if (!mk->mp) + return -ENOMEM; + mk->mp->grp.name = "parameters"; + /* NULL-terminated attribute array. */ + mk->mp->grp.attrs = kzalloc(sizeof(mk->mp->grp.attrs[0]), + GFP_KERNEL); + /* Caller will cleanup via free_module_param_attrs */ + if (!mk->mp->grp.attrs) + return -ENOMEM; } - /* Enlarge. */ - new = krealloc(mk->mp, - sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1), - GFP_KERNEL); - if (!new) { - kfree(attrs); - err = -ENOMEM; - goto fail; - } - /* Despite looking like the typical realloc() bug, this is safe. - * We *want* the old 'attrs' to be freed either way, and we'll store - * the new one in the success case. */ - attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL); - if (!attrs) { - err = -ENOMEM; - goto fail_free_new; - } + /* Enlarge allocations. */ + new_mp = krealloc(mk->mp, + sizeof(*mk->mp) + + sizeof(mk->mp->attrs[0]) * (mk->mp->num + 1), + GFP_KERNEL); + if (!new_mp) + return -ENOMEM; + mk->mp = new_mp; - /* Sysfs wants everything zeroed. */ - memset(new, 0, sizeof(*new)); - memset(&new->attrs[num], 0, sizeof(new->attrs[num])); - memset(&attrs[num], 0, sizeof(attrs[num])); - new->grp.name = "parameters"; - new->grp.attrs = attrs; + /* Extra pointer for NULL terminator */ + new_attrs = krealloc(mk->mp->grp.attrs, + sizeof(mk->mp->grp.attrs[0]) * (mk->mp->num + 2), + GFP_KERNEL); + if (!new_attrs) + return -ENOMEM; + mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ - sysfs_attr_init(&new->attrs[num].mattr.attr); - new->attrs[num].param = kp; - new->attrs[num].mattr.show = param_attr_show; - new->attrs[num].mattr.store = param_attr_store; - new->attrs[num].mattr.attr.name = (char *)name; - new->attrs[num].mattr.attr.mode = kp->perm; - new->num = num+1; + sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); + mk->mp->attrs[mk->mp->num].param = kp; + mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; + /* Do not allow runtime DAC changes to make param writable. */ + if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) + mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; + mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; + mk->mp->num++; /* Fix up all the pointers, since krealloc can move us */ - for (num = 0; num < new->num; num++) - new->grp.attrs[num] = &new->attrs[num].mattr.attr; - new->grp.attrs[num] = NULL; - - mk->mp = new; + for (i = 0; i < mk->mp->num; i++) + mk->mp->grp.attrs[i] = &mk->mp->attrs[i].mattr.attr; + mk->mp->grp.attrs[mk->mp->num] = NULL; return 0; - -fail_free_new: - kfree(new); -fail: - mk->mp = NULL; - return err; } #ifdef CONFIG_MODULES static void free_module_param_attrs(struct module_kobject *mk) { - kfree(mk->mp->grp.attrs); + if (mk->mp) + kfree(mk->mp->grp.attrs); kfree(mk->mp); mk->mp = NULL; } @@ -695,8 +688,10 @@ int module_param_sysfs_setup(struct module *mod, if (kparam[i].perm == 0) continue; err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name); - if (err) + if (err) { + free_module_param_attrs(&mod->mkobj); return err; + } params = true; } diff --git a/kernel/pid.c b/kernel/pid.c index 9b9a2669814..cd36a5e0d17 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -79,7 +79,10 @@ struct pid_namespace init_pid_ns = { .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .proc_inum = PROC_PID_INIT_INO, + .ns.inum = PROC_PID_INIT_INO, +#ifdef CONFIG_PID_NS + .ns.ops = &pidns_operations, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -341,6 +344,8 @@ out: out_unlock: spin_unlock_irq(&pidmap_lock); + put_pid_ns(ns); + out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index db95d8eb761..a65ba137fd1 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -105,9 +105,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_map; - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) goto out_free_map; + ns->ns.ops = &pidns_operations; kref_init(&ns->kref); ns->level = level; @@ -142,7 +143,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) { int i; - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); for (i = 0; i < PIDMAP_ENTRIES; i++) kfree(ns->pidmap[i].page); put_user_ns(ns->user_ns); @@ -190,7 +191,11 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); - /* Ignore SIGCHLD causing any terminated children to autoreap */ + /* + * Ignore SIGCHLD causing any terminated children to autoreap. + * This speeds up the namespace shutdown, plus see the comment + * below. + */ spin_lock_irq(&me->sighand->siglock); me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; spin_unlock_irq(&me->sighand->siglock); @@ -223,15 +228,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); - /* Firstly reap the EXIT_ZOMBIE children we may have. */ + /* + * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD. + * sys_wait4() will also block until our children traced from the + * parent namespace are detached and become EXIT_DEAD. + */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); /* - * sys_wait4() above can't reap the TASK_DEAD children. - * Make sure they all go away, see free_pid(). + * sys_wait4() above can't reap the EXIT_DEAD children but we do not + * really care, we could reparent them to the global init. We could + * exit and reap ->child_reaper even if it is not the last thread in + * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(), + * pid_ns can not go away until proc_kill_sb() drops the reference. + * + * But this ns can also have other tasks injected by setns()+fork(). + * Again, ignoring the user visible semantics we do not really need + * to wait until they are all reaped, but they can be reparented to + * us and thus we need to ensure that pid->child_reaper stays valid + * until they all go away. See free_pid()->wake_up_process(). + * + * We rely on ignored SIGCHLD, an injected zombie must be autoreaped + * if reparented. */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -313,7 +334,12 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static void *pidns_get(struct task_struct *task) +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + +static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -323,18 +349,18 @@ static void *pidns_get(struct task_struct *task) get_pid_ns(ns); rcu_read_unlock(); - return ns; + return ns ? &ns->ns : NULL; } -static void pidns_put(void *ns) +static void pidns_put(struct ns_common *ns) { - put_pid_ns(ns); + put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, void *ns) +static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = ns; + struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -362,19 +388,12 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int pidns_inum(void *ns) -{ - struct pid_namespace *pid_ns = ns; - return pid_ns->proc_inum; -} - const struct proc_ns_operations pidns_operations = { .name = "pid", .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, - .inum = pidns_inum, }; static __init int pid_namespaces_init(void) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index bbef57f5bdf..6e7708c2c21 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -94,6 +94,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS + select PM_RUNTIME config PM_SLEEP_SMP def_bool y @@ -131,7 +132,6 @@ config PM_WAKELOCKS_GC config PM_RUNTIME bool "Run-time PM core functionality" - depends on !IA64_HP_SIM ---help--- Enable functionality allowing I/O devices to be put into energy-saving (low power) states at run time (or autosuspended) after a specified @@ -298,14 +298,9 @@ config PM_GENERIC_DOMAINS_SLEEP def_bool y depends on PM_SLEEP && PM_GENERIC_DOMAINS -config PM_GENERIC_DOMAINS_RUNTIME - def_bool y - depends on PM_RUNTIME && PM_GENERIC_DOMAINS - config PM_GENERIC_DOMAINS_OF def_bool y depends on PM_GENERIC_DOMAINS && OF config CPU_PM bool - depends on SUSPEND || CPU_IDLE diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f35a3478f3..2329daae525 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -28,6 +28,7 @@ #include <linux/syscore_ops.h> #include <linux/ctype.h> #include <linux/genhd.h> +#include <linux/ktime.h> #include <trace/events/power.h> #include "power.h" @@ -232,20 +233,17 @@ static void platform_recover(int platform_mode) * @nr_pages: Number of memory pages processed between @start and @stop. * @msg: Additional diagnostic message to print. */ -void swsusp_show_speed(struct timeval *start, struct timeval *stop, - unsigned nr_pages, char *msg) +void swsusp_show_speed(ktime_t start, ktime_t stop, + unsigned nr_pages, char *msg) { + ktime_t diff; u64 elapsed_centisecs64; unsigned int centisecs; unsigned int k; unsigned int kps; - elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); - /* - * If "(s64)elapsed_centisecs64 < 0", it will print long elapsed time, - * it is obvious enough for what went wrong. - */ - do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + diff = ktime_sub(stop, start); + elapsed_centisecs64 = ktime_divns(diff, 10*NSEC_PER_MSEC); centisecs = elapsed_centisecs64; if (centisecs == 0) centisecs = 1; /* avoid div-by-zero */ diff --git a/kernel/power/power.h b/kernel/power/power.h index 2df883a9d3c..ce9b8328a68 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -174,8 +174,7 @@ extern int hib_wait_on_bio_chain(struct bio **bio_chain); struct timeval; /* kernel/power/swsusp.c */ -extern void swsusp_show_speed(struct timeval *, struct timeval *, - unsigned int, char *); +extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 791a61892bb..0c40c16174b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/compiler.h> +#include <linux/ktime.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -1576,11 +1577,11 @@ int hibernate_preallocate_memory(void) struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; unsigned long alloc, save_highmem, pages_highmem, avail_normal; - struct timeval start, stop; + ktime_t start, stop; int error; printk(KERN_INFO "PM: Preallocating image memory... "); - do_gettimeofday(&start); + start = ktime_get(); error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); if (error) @@ -1709,9 +1710,9 @@ int hibernate_preallocate_memory(void) free_unnecessary_pages(); out: - do_gettimeofday(&stop); + stop = ktime_get(); printk(KERN_CONT "done (allocated %lu pages)\n", pages); - swsusp_show_speed(&start, &stop, pages, "Allocated"); + swsusp_show_speed(start, stop, pages, "Allocated"); return 0; diff --git a/kernel/power/swap.c b/kernel/power/swap.c index aaa3261dea5..570aff81754 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -30,6 +30,7 @@ #include <linux/atomic.h> #include <linux/kthread.h> #include <linux/crc32.h> +#include <linux/ktime.h> #include "power.h" @@ -445,8 +446,8 @@ static int save_image(struct swap_map_handle *handle, int nr_pages; int err2; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n", nr_to_write); @@ -455,7 +456,7 @@ static int save_image(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); while (1) { ret = snapshot_read_next(snapshot); if (ret <= 0) @@ -469,12 +470,12 @@ static int save_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); return ret; } @@ -580,8 +581,8 @@ static int save_image_lzo(struct swap_map_handle *handle, int nr_pages; int err2; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; size_t off; unsigned thr, run_threads, nr_threads; unsigned char *page = NULL; @@ -674,7 +675,7 @@ static int save_image_lzo(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for (;;) { for (thr = 0; thr < nr_threads; thr++) { for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { @@ -759,12 +760,12 @@ static int save_image_lzo(struct swap_map_handle *handle, out_finish: err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) printk(KERN_INFO "PM: Image saving done.\n"); - swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); out_clean: if (crc) { if (crc->thr) @@ -965,8 +966,8 @@ static int load_image(struct swap_map_handle *handle, { unsigned int m; int ret = 0; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; struct bio *bio; int err2; unsigned nr_pages; @@ -978,7 +979,7 @@ static int load_image(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); for ( ; ; ) { ret = snapshot_write_next(snapshot); if (ret <= 0) @@ -996,7 +997,7 @@ static int load_image(struct swap_map_handle *handle, nr_pages++; } err2 = hib_wait_on_bio_chain(&bio); - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) ret = err2; if (!ret) { @@ -1005,7 +1006,7 @@ static int load_image(struct swap_map_handle *handle, if (!snapshot_image_loaded(snapshot)) ret = -ENODATA; } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); return ret; } @@ -1067,8 +1068,8 @@ static int load_image_lzo(struct swap_map_handle *handle, int ret = 0; int eof = 0; struct bio *bio; - struct timeval start; - struct timeval stop; + ktime_t start; + ktime_t stop; unsigned nr_pages; size_t off; unsigned i, thr, run_threads, nr_threads; @@ -1190,7 +1191,7 @@ static int load_image_lzo(struct swap_map_handle *handle, m = 1; nr_pages = 0; bio = NULL; - do_gettimeofday(&start); + start = ktime_get(); ret = snapshot_write_next(snapshot); if (ret <= 0) @@ -1343,7 +1344,7 @@ out_finish: wait_event(crc->done, atomic_read(&crc->stop)); atomic_set(&crc->stop, 0); } - do_gettimeofday(&stop); + stop = ktime_get(); if (!ret) { printk(KERN_INFO "PM: Image loading done.\n"); snapshot_write_finalize(snapshot); @@ -1359,7 +1360,7 @@ out_finish: } } } - swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + swsusp_show_speed(start, stop, nr_to_read, "Read"); out_clean: for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); @@ -1374,7 +1375,7 @@ out_clean: kthread_stop(data[thr].thr); vfree(data); } - if (page) vfree(page); + vfree(page); return ret; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ced2b84b1cb..02d6b6d2879 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -62,9 +62,6 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; -/* Deferred messaged from sched code are marked by this special level */ -#define SCHED_MESSAGE_LOGLEVEL -2 - /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -480,7 +477,7 @@ static int syslog_action_restricted(int type) type != SYSLOG_ACTION_SIZE_BUFFER; } -static int check_syslog_permissions(int type, bool from_file) +int check_syslog_permissions(int type, bool from_file) { /* * If this is from /proc/kmsg and we've already opened it, then we've @@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; - static int saved_console_loglevel = -1; + static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, from_file); @@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: - if (saved_console_loglevel == -1) + if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: - if (saved_console_loglevel != -1) { + if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ @@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; error = 0; break; /* Number of chars in the log buffer */ @@ -1627,10 +1624,10 @@ asmlinkage int vprintk_emit(int facility, int level, int printed_len = 0; bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ - static volatile unsigned int logbuf_cpu = UINT_MAX; + static unsigned int logbuf_cpu = UINT_MAX; - if (level == SCHED_MESSAGE_LOGLEVEL) { - level = -1; + if (level == LOGLEVEL_SCHED) { + level = LOGLEVEL_DEFAULT; in_sched = true; } @@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level, const char *end_of_header = printk_skip_level(text); switch (kern_level) { case '0' ... '7': - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; + /* fallthrough */ case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; } @@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, } } - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dict) @@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, -1, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); } EXPORT_SYMBOL(vprintk); @@ -1807,6 +1805,30 @@ asmlinkage int printk_emit(int facility, int level, } EXPORT_SYMBOL(printk_emit); +int vprintk_default(const char *fmt, va_list args) +{ + int r; + +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + r = vkdb_printf(fmt, args); + return r; + } +#endif + r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + + return r; +} +EXPORT_SYMBOL_GPL(vprintk_default); + +/* + * This allows printk to be diverted to another function per cpu. + * This is useful for calling printk functions from within NMI + * without worrying about race conditions that can lock up the + * box. + */ +DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default; + /** * printk - print a kernel message * @fmt: format string @@ -1830,19 +1852,21 @@ EXPORT_SYMBOL(printk_emit); */ asmlinkage __visible int printk(const char *fmt, ...) { + printk_func_t vprintk_func; va_list args; int r; -#ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { - va_start(args, fmt); - r = vkdb_printf(fmt, args); - va_end(args); - return r; - } -#endif va_start(args, fmt); - r = vprintk_emit(0, -1, NULL, 0, fmt, args); + + /* + * If a caller overrides the per_cpu printk_func, then it needs + * to disable preemption when calling printk(). Otherwise + * the printk_func should be set to the default. No need to + * disable preemption here. + */ + vprintk_func = this_cpu_read(printk_func); + r = vprintk_func(fmt, args); + va_end(args); return r; @@ -1876,28 +1900,28 @@ static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev, bool syslog, char *buf, size_t size) { return 0; } static size_t cont_print_text(char *text, size_t size) { return 0; } +/* Still needs to be defined for users */ +DEFINE_PER_CPU(printk_func_t, printk_func); + #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; -void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; + char buf[512]; + int n; + + if (!early_console) + return; va_start(ap, fmt); - early_vprintk(fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); + + early_console->write(early_console, buf, n); } #endif @@ -2634,7 +2658,7 @@ int printk_deferred(const char *fmt, ...) preempt_disable(); va_start(args, fmt); - r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c..1eb9d90c3af 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* * Detach all tasks we were using ptrace on. Called with tasklist held - * for writing, and returns with it held too. But note it can release - * and reacquire the lock. + * for writing. */ -void exit_ptrace(struct task_struct *tracer) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +void exit_ptrace(struct task_struct *tracer, struct list_head *dead) { struct task_struct *p, *n; - LIST_HEAD(ptrace_dead); - - if (likely(list_empty(&tracer->ptraced))) - return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) send_sig_info(SIGKILL, SEND_SIG_FORCED, p); if (__ptrace_detach(tracer, p)) - list_add(&p->ptrace_entry, &ptrace_dead); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&tracer->ptraced)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); + list_add(&p->ptrace_entry, dead); } - - write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) diff --git a/kernel/res_counter.c b/kernel/res_counter.c deleted file mode 100644 index e791130f85a..00000000000 --- a/kernel/res_counter.c +++ /dev/null @@ -1,211 +0,0 @@ -/* - * resource cgroups - * - * Copyright 2007 OpenVZ SWsoft Inc - * - * Author: Pavel Emelianov <xemul@openvz.org> - * - */ - -#include <linux/types.h> -#include <linux/parser.h> -#include <linux/fs.h> -#include <linux/res_counter.h> -#include <linux/uaccess.h> -#include <linux/mm.h> - -void res_counter_init(struct res_counter *counter, struct res_counter *parent) -{ - spin_lock_init(&counter->lock); - counter->limit = RES_COUNTER_MAX; - counter->soft_limit = RES_COUNTER_MAX; - counter->parent = parent; -} - -static u64 res_counter_uncharge_locked(struct res_counter *counter, - unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - -static int res_counter_charge_locked(struct res_counter *counter, - unsigned long val, bool force) -{ - int ret = 0; - - if (counter->usage + val > counter->limit) { - counter->failcnt++; - ret = -ENOMEM; - if (!force) - return ret; - } - - counter->usage += val; - if (counter->usage > counter->max_usage) - counter->max_usage = counter->usage; - return ret; -} - -static int __res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at, bool force) -{ - int ret, r; - unsigned long flags; - struct res_counter *c, *u; - - r = ret = 0; - *limit_fail_at = NULL; - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - r = res_counter_charge_locked(c, val, force); - spin_unlock(&c->lock); - if (r < 0 && !ret) { - ret = r; - *limit_fail_at = c; - if (!force) - break; - } - } - - if (ret < 0 && !force) { - for (u = counter; u != c; u = u->parent) { - spin_lock(&u->lock); - res_counter_uncharge_locked(u, val); - spin_unlock(&u->lock); - } - } - local_irq_restore(flags); - - return ret; -} - -int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, false); -} - -int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, true); -} - -u64 res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) -{ - unsigned long flags; - struct res_counter *c; - u64 ret = 0; - - local_irq_save(flags); - for (c = counter; c != top; c = c->parent) { - u64 r; - spin_lock(&c->lock); - r = res_counter_uncharge_locked(c, val); - if (c == counter) - ret = r; - spin_unlock(&c->lock); - } - local_irq_restore(flags); - return ret; -} - -u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) -{ - return res_counter_uncharge_until(counter, NULL, val); -} - -static inline unsigned long long * -res_counter_member(struct res_counter *counter, int member) -{ - switch (member) { - case RES_USAGE: - return &counter->usage; - case RES_MAX_USAGE: - return &counter->max_usage; - case RES_LIMIT: - return &counter->limit; - case RES_FAILCNT: - return &counter->failcnt; - case RES_SOFT_LIMIT: - return &counter->soft_limit; - }; - - BUG(); - return NULL; -} - -ssize_t res_counter_read(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*read_strategy)(unsigned long long val, char *st_buf)) -{ - unsigned long long *val; - char buf[64], *s; - - s = buf; - val = res_counter_member(counter, member); - if (read_strategy) - s += read_strategy(*val, s); - else - s += sprintf(s, "%llu\n", *val); - return simple_read_from_buffer((void __user *)userbuf, nbytes, - pos, buf, s - buf); -} - -#if BITS_PER_LONG == 32 -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&counter->lock, flags); - ret = *res_counter_member(counter, member); - spin_unlock_irqrestore(&counter->lock, flags); - - return ret; -} -#else -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - return *res_counter_member(counter, member); -} -#endif - -int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *resp) -{ - char *end; - unsigned long long res; - - /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ - if (*buf == '-') { - int rc = kstrtoull(buf + 1, 10, &res); - - if (rc) - return rc; - if (res != 1) - return -EINVAL; - *resp = RES_COUNTER_MAX; - return 0; - } - - res = memparse(buf, &end); - if (*end != '\0') - return -EINVAL; - - if (PAGE_ALIGN(res) >= res) - res = PAGE_ALIGN(res); - else - res = RES_COUNTER_MAX; - - *resp = res; - - return 0; -} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bb398c0c5f0..b5797b78add 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4527,8 +4527,10 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif + ppid = 0; rcu_read_lock(); - ppid = task_pid_nr(rcu_dereference(p->real_parent)); + if (pid_alive(p)) + ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), ppid, diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 00fe55cc5a8..b6e4c16377c 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -25,6 +25,38 @@ void print_stack_trace(struct stack_trace *trace, int spaces) } EXPORT_SYMBOL_GPL(print_stack_trace); +int snprint_stack_trace(char *buf, size_t size, + struct stack_trace *trace, int spaces) +{ + int i; + unsigned long ip; + int generated; + int total = 0; + + if (WARN_ON(!trace->entries)) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + ip = trace->entries[i]; + generated = snprintf(buf, size, "%*c[<%p>] %pS\n", + 1 + spaces, ' ', (void *) ip, (void *) ip); + + total += generated; + + /* Assume that generated isn't a negative number */ + if (generated >= size) { + buf += size; + size = 0; + } else { + buf += generated; + size -= generated; + } + } + + return total; +} +EXPORT_SYMBOL_GPL(snprint_stack_trace); + /* * Architectures that do not implement save_stack_trace_tsk or * save_stack_trace_regs get this weak alias and a once-per-bootup warning diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 02aa4185b17..5adcb0ae3a5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -169,6 +169,8 @@ cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); cond_syscall(sys_subpage_prot); +cond_syscall(sys_s390_pci_mmio_read); +cond_syscall(sys_s390_pci_mmio_write); /* mmu depending weak syscall entries */ cond_syscall(sys_mprotect); @@ -224,3 +226,6 @@ cond_syscall(sys_seccomp); /* access BPF programs and maps */ cond_syscall(sys_bpf); + +/* execveat */ +cond_syscall(sys_execveat); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 15f2511a1b7..137c7f69b26 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -623,6 +623,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tracepoint_printk", + .data = &tracepoint_printk, + .maxlen = sizeof(tracepoint_printk), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_KEXEC { @@ -1104,6 +1111,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 9a4f750a296..7e7746a42a6 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, {} }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b312fcc7302..670fff88a96 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -459,7 +459,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) stats = nla_data(na); memset(stats, 0, sizeof(*stats)); - rc = cgroupstats_build(stats, f.file->f_dentry); + rc = cgroupstats_build(stats, f.file->f_path.dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2e949cc9c9f..b79f39bda7e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -792,7 +792,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) /* Initialize mult/shift and max_idle_ns */ __clocksource_updatefreq_scale(cs, scale, freq); - /* Add clocksource to the clcoksource list */ + /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1f4356037a7..4d54b754058 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -235,7 +235,7 @@ void tick_nohz_full_kick(void) if (!tick_nohz_full_cpu(smp_processor_id())) return; - irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); + irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* diff --git a/kernel/time/time.c b/kernel/time/time.c index 65015ff2f07..6390517e77d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -741,6 +741,7 @@ u64 nsecs_to_jiffies64(u64 n) return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); #endif } +EXPORT_SYMBOL(nsecs_to_jiffies64); /** * nsecs_to_jiffies - Convert nsecs in u64 to jiffies diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 67d6369ddf8..979ccde2672 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -55,7 +55,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o -ifeq ($(CONFIG_PM_RUNTIME),y) +ifeq ($(CONFIG_PM),y) obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o endif ifeq ($(CONFIG_TRACING),y) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c1bd4ada2a0..483cecfa5c1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1142,9 +1142,9 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); -static int blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1154,33 +1154,33 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act) fill_rwbs(rwbs, t); - return trace_seq_printf(&iter->seq, - "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", - MAJOR(t->device), MINOR(t->device), iter->cpu, - secs, nsec_rem, iter->ent->pid, act, rwbs); + trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, nsec_rem, iter->ent->pid, act, rwbs); } -static int blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); } -static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) { const unsigned char *pdu_buf; int pdu_len; - int i, end, ret; + int i, end; pdu_buf = pdu_start(ent); pdu_len = te_blk_io_trace(ent)->pdu_len; if (!pdu_len) - return 1; + return; /* find the last zero that needs to be printed */ for (end = pdu_len - 1; end >= 0; end--) @@ -1188,119 +1188,107 @@ static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) break; end++; - if (!trace_seq_putc(s, '(')) - return 0; + trace_seq_putc(s, '('); for (i = 0; i < pdu_len; i++) { - ret = trace_seq_printf(s, "%s%02x", - i == 0 ? "" : " ", pdu_buf[i]); - if (!ret) - return ret; + trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); /* * stop when the rest is just zeroes and indicate so * with a ".." appended */ - if (i == end && end != pdu_len - 1) - return trace_seq_puts(s, " ..) "); + if (i == end && end != pdu_len - 1) { + trace_seq_puts(s, " ..) "); + return; + } } - return trace_seq_puts(s, ") "); + trace_seq_puts(s, ") "); } -static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = trace_seq_printf(s, "%u ", t_bytes(ent)); - if (!ret) - return 0; - ret = blk_log_dump_pdu(s, ent); - if (!ret) - return 0; - return trace_seq_printf(s, "[%s]\n", cmd); + trace_seq_printf(s, "%u ", t_bytes(ent)); + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", + trace_seq_printf(s, "%llu + %u [%s]\n", t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + else + trace_seq_printf(s, "[%s]\n", cmd); } } -static int blk_log_with_error(struct trace_seq *s, +static void blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - int ret; - - ret = blk_log_dump_pdu(s, ent); - if (ret) - return trace_seq_printf(s, "[%d]\n", t_error(ent)); - return 0; + blk_log_dump_pdu(s, ent); + trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", - t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", - t_sector(ent), t_error(ent)); + trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + else + trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); } } -static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); - return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), t_sec(ent), - MAJOR(r.device_from), MINOR(r.device_from), - (unsigned long long)r.sector_from); + trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } -static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + trace_seq_printf(s, "[%s]\n", cmd); } -static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); } -static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), cmd); } -static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) { - int ret; const struct blk_io_trace *t = te_blk_io_trace(ent); - ret = trace_seq_putmem(s, t + 1, t->pdu_len); - if (ret) - return trace_seq_putc(s, '\n'); - return ret; + trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putc(s, '\n'); } /* @@ -1339,7 +1327,7 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - int (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1364,7 +1352,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; const struct blk_io_trace *t; u16 what; - int ret; bool long_act; blk_log_action_t *log_action; @@ -1374,21 +1361,18 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, log_action = classic ? &blk_log_action_classic : &blk_log_action; if (t->action == BLK_TN_MESSAGE) { - ret = log_action(iter, long_act ? "message" : "m"); - if (ret) - ret = blk_log_msg(s, iter->ent); - goto out; + log_action(iter, long_act ? "message" : "m"); + blk_log_msg(s, iter->ent); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Unknown action %x\n", what); + trace_seq_printf(s, "Unknown action %x\n", what); else { - ret = log_action(iter, what2act[what].act[long_act]); - if (ret) - ret = what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act]); + what2act[what].print(s, iter->ent); } -out: - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, @@ -1397,7 +1381,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, return print_one_line(iter, false); } -static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; @@ -1407,18 +1391,18 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) .time = iter->ts, }; - if (!trace_seq_putmem(s, &old, offset)) - return 0; - return trace_seq_putmem(s, &t->sector, - sizeof(old) - offset + t->pdu_len); + trace_seq_putmem(s, &old, offset); + trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); } static enum print_line_t blk_trace_event_print_binary(struct trace_iterator *iter, int flags, struct trace_event *event) { - return blk_trace_synthesize_old_trace(iter) ? - TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + blk_trace_synthesize_old_trace(iter); + + return trace_handle_return(&iter->seq); } static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) @@ -1493,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) if (atomic_dec_and_test(&blk_probes_ref)) blk_unregister_tracepoints(); - spin_lock_irq(&running_trace_lock); - list_del(&bt->running_list); - spin_unlock_irq(&running_trace_lock); blk_trace_free(bt); return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 31c90fec415..929a733d302 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -387,6 +387,8 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, return ret; } +static void ftrace_update_trampoline(struct ftrace_ops *ops); + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ops->flags & FTRACE_OPS_FL_DELETED) @@ -416,9 +418,13 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (control_ops_alloc(ops)) return -ENOMEM; add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); + /* The control_ops needs the trampoline update */ + ops = &control_ops; } else add_ftrace_ops(&ftrace_ops_list, ops); + ftrace_update_trampoline(ops); + if (ftrace_enabled) update_ftrace_function(); @@ -565,13 +571,13 @@ static int function_stat_cmp(void *p1, void *p2) static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function " - "Hit Time Avg s^2\n" - " -------- " - "--- ---- --- ---\n"); + seq_puts(m, " Function " + "Hit Time Avg s^2\n" + " -------- " + "--- ---- --- ---\n"); #else - seq_printf(m, " Function Hit\n" - " -------- ---\n"); + seq_puts(m, " Function Hit\n" + " -------- ---\n"); #endif return 0; } @@ -598,7 +604,7 @@ static int function_stat_show(struct seq_file *m, void *v) seq_printf(m, " %-30.30s %10lu", str, rec->counter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " "); + seq_puts(m, " "); avg = rec->time; do_div(avg, rec->counter); @@ -1111,6 +1117,43 @@ static struct ftrace_ops global_ops = { FTRACE_OPS_FL_INITIALIZED, }; +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + struct ftrace_ops *op; + bool ret = false; + + /* + * Some of the ops may be dynamically allocated, + * they are freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + + do_for_each_ftrace_op(op, ftrace_ops_list) { + /* + * This is to check for dynamically allocated trampolines. + * Trampolines that are in kernel text will have + * core_kernel_text() return true. + */ + if (op->trampoline && op->trampoline_size) + if (addr >= op->trampoline && + addr < op->trampoline + op->trampoline_size) { + ret = true; + goto out; + } + } while_for_each_ftrace_op(op); + + out: + preempt_enable_notrace(); + + return ret; +} + struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; @@ -1315,6 +1358,9 @@ ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash); static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash); + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) @@ -1325,8 +1371,13 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash *new_hash; int size = src->count; int bits = 0; + int ret; int i; + /* Reject setting notrace hash on IPMODIFY ftrace_ops */ + if (ops->flags & FTRACE_OPS_FL_IPMODIFY && !enable) + return -EINVAL; + /* * If the new source is empty, just free dst and assign it * the empty_hash. @@ -1360,6 +1411,16 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, } update: + /* Make sure this can be applied if it is IPMODIFY ftrace_ops */ + if (enable) { + /* IPMODIFY should be updated only when filter_hash updating */ + ret = ftrace_hash_ipmodify_update(ops, new_hash); + if (ret < 0) { + free_ftrace_hash(new_hash); + return ret; + } + } + /* * Remove the current set, update the hash and add * them back. @@ -1724,6 +1785,114 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, ftrace_hash_rec_update_modify(ops, filter_hash, 1); } +/* + * Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK + * or no-needed to update, -EBUSY if it detects a conflict of the flag + * on a ftrace_rec, and -EINVAL if the new_hash tries to trace all recs. + * Note that old_hash and new_hash has below meanings + * - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected) + * - If the hash is EMPTY_HASH, it hits nothing + * - Anything else hits the recs which match the hash entries. + */ +static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, + struct ftrace_hash *old_hash, + struct ftrace_hash *new_hash) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec, *end = NULL; + int in_old, in_new; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return 0; + + if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY)) + return 0; + + /* + * Since the IPMODIFY is a very address sensitive action, we do not + * allow ftrace_ops to set all functions to new hash. + */ + if (!new_hash || !old_hash) + return -EINVAL; + + /* Update rec->flags */ + do_for_each_ftrace_rec(pg, rec) { + /* We need to update only differences of filter_hash */ + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) { + /* New entries must ensure no others are using it */ + if (rec->flags & FTRACE_FL_IPMODIFY) + goto rollback; + rec->flags |= FTRACE_FL_IPMODIFY; + } else /* Removed entry */ + rec->flags &= ~FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + + return 0; + +rollback: + end = rec; + + /* Roll back what we did above */ + do_for_each_ftrace_rec(pg, rec) { + if (rec == end) + goto err_out; + + in_old = !!ftrace_lookup_ip(old_hash, rec->ip); + in_new = !!ftrace_lookup_ip(new_hash, rec->ip); + if (in_old == in_new) + continue; + + if (in_new) + rec->flags &= ~FTRACE_FL_IPMODIFY; + else + rec->flags |= FTRACE_FL_IPMODIFY; + } while_for_each_ftrace_rec(); + +err_out: + return -EBUSY; +} + +static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); +} + +/* Disabling always succeeds */ +static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(hash)) + hash = NULL; + + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); +} + +static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, + struct ftrace_hash *new_hash) +{ + struct ftrace_hash *old_hash = ops->func_hash->filter_hash; + + if (ftrace_hash_empty(old_hash)) + old_hash = NULL; + + if (ftrace_hash_empty(new_hash)) + new_hash = NULL; + + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); +} + static void print_ip_ins(const char *fmt, unsigned char *p) { int i; @@ -1734,10 +1903,13 @@ static void print_ip_ins(const char *fmt, unsigned char *p) printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); } +static struct ftrace_ops * +ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); + /** * ftrace_bug - report and shutdown function tracer * @failed: The failed type (EFAULT, EINVAL, EPERM) - * @ip: The address that failed + * @rec: The record that failed * * The arch code that enables or disables the function tracing * can call ftrace_bug() when it has detected a problem in @@ -1746,8 +1918,10 @@ static void print_ip_ins(const char *fmt, unsigned char *p) * EINVAL - if what is read at @ip is not what was expected * EPERM - if the problem happens on writting to the @ip address */ -void ftrace_bug(int failed, unsigned long ip) +void ftrace_bug(int failed, struct dyn_ftrace *rec) { + unsigned long ip = rec ? rec->ip : 0; + switch (failed) { case -EFAULT: FTRACE_WARN_ON_ONCE(1); @@ -1759,7 +1933,7 @@ void ftrace_bug(int failed, unsigned long ip) pr_info("ftrace failed to modify "); print_ip_sym(ip); print_ip_ins(" actual: ", (unsigned char *)ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1771,6 +1945,24 @@ void ftrace_bug(int failed, unsigned long ip) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + if (rec) { + struct ftrace_ops *ops = NULL; + + pr_info("ftrace record flags: %lx\n", rec->flags); + pr_cont(" (%ld)%s", ftrace_rec_count(rec), + rec->flags & FTRACE_FL_REGS ? " R" : " "); + if (rec->flags & FTRACE_FL_TRAMP_EN) { + ops = ftrace_find_tramp_ops_any(rec); + if (ops) + pr_cont("\ttramp: %pS", + (void *)ops->trampoline); + else + pr_cont("\ttramp: ERROR!"); + + } + ip = ftrace_get_addr_curr(rec); + pr_cont(" expected tramp: %lx\n", ip); + } } static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) @@ -2093,7 +2285,7 @@ void __weak ftrace_replace_code(int enable) do_for_each_ftrace_rec(pg, rec) { failed = __ftrace_replace_code(rec, enable); if (failed) { - ftrace_bug(failed, rec->ip); + ftrace_bug(failed, rec); /* Stop processing */ return; } @@ -2175,17 +2367,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) static int ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) { - unsigned long ip; int ret; - ip = rec->ip; - if (unlikely(ftrace_disabled)) return 0; ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { - ftrace_bug(ret, ip); + ftrace_bug(ret, rec); return 0; } return 1; @@ -2320,6 +2509,10 @@ static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; +void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) +{ +} + static void control_ops_free(struct ftrace_ops *ops) { free_percpu(ops->disabled); @@ -2369,6 +2562,15 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) */ ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING; + ret = ftrace_hash_ipmodify_enable(ops); + if (ret < 0) { + /* Rollback registration process */ + __unregister_ftrace_function(ops); + ftrace_start_up--; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + return ret; + } + ftrace_hash_rec_enable(ops, 1); ftrace_startup_enable(command); @@ -2397,6 +2599,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) */ WARN_ON_ONCE(ftrace_start_up < 0); + /* Disabling ipmodify never fails */ + ftrace_hash_ipmodify_disable(ops); ftrace_hash_rec_disable(ops, 1); ops->flags &= ~FTRACE_OPS_FL_ENABLED; @@ -2471,6 +2675,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { schedule_on_each_cpu(ftrace_sync); + arch_ftrace_trampoline_free(ops); + if (ops->flags & FTRACE_OPS_FL_CONTROL) control_ops_free(ops); } @@ -2623,7 +2829,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) if (ftrace_start_up && cnt) { int failed = __ftrace_replace_code(p, 1); if (failed) - ftrace_bug(failed, p->ip); + ftrace_bug(failed, p); } } } @@ -2948,6 +3154,22 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&ftrace_lock); } +void * __weak +arch_ftrace_trampoline_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ + return NULL; +} + +static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, + struct dyn_ftrace *rec) +{ + void *ptr; + + ptr = arch_ftrace_trampoline_func(ops, rec); + if (ptr) + seq_printf(m, " ->%pS", ptr); +} + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -2958,9 +3180,9 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) - seq_printf(m, "#### no functions disabled ####\n"); + seq_puts(m, "#### no functions disabled ####\n"); else - seq_printf(m, "#### all functions enabled ####\n"); + seq_puts(m, "#### all functions enabled ####\n"); return 0; } @@ -2971,22 +3193,25 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) { - seq_printf(m, " (%ld)%s", + struct ftrace_ops *ops = NULL; + + seq_printf(m, " (%ld)%s%s", ftrace_rec_count(rec), - rec->flags & FTRACE_FL_REGS ? " R" : " "); + rec->flags & FTRACE_FL_REGS ? " R" : " ", + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { - struct ftrace_ops *ops; - ops = ftrace_find_tramp_ops_any(rec); if (ops) seq_printf(m, "\ttramp: %pS", (void *)ops->trampoline); else - seq_printf(m, "\ttramp: ERROR!"); + seq_puts(m, "\ttramp: ERROR!"); + } + add_trampoline_func(m, ops, rec); } - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -3020,9 +3245,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file) { struct ftrace_iterator *iter; - if (unlikely(ftrace_disabled)) - return -ENODEV; - iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); if (iter) { iter->pg = ftrace_pages_start; @@ -3975,6 +4197,9 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); +static unsigned long save_global_trampoline; +static unsigned long save_global_flags; + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -4183,9 +4408,9 @@ static int g_show(struct seq_file *m, void *v) struct ftrace_graph_data *fgd = m->private; if (fgd->table == ftrace_graph_funcs) - seq_printf(m, "#### all functions enabled ####\n"); + seq_puts(m, "#### all functions enabled ####\n"); else - seq_printf(m, "#### no functions disabled ####\n"); + seq_puts(m, "#### no functions disabled ####\n"); return 0; } @@ -4696,6 +4921,32 @@ void __init ftrace_init(void) ftrace_disabled = 1; } +/* Do nothing if arch does not support this */ +void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ + +/* + * Currently there's no safe way to free a trampoline when the kernel + * is configured with PREEMPT. That is because a task could be preempted + * when it jumped to the trampoline, it may be preempted for a long time + * depending on the system load, and currently there's no way to know + * when it will be off the trampoline. If the trampoline is freed + * too early, when the task runs again, it will be executing on freed + * memory and crash. + */ +#ifdef CONFIG_PREEMPT + /* Currently, only non dynamic ops can have a trampoline */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + return; +#endif + + arch_ftrace_update_trampoline(ops); +} + #else static struct ftrace_ops global_ops = { @@ -4738,6 +4989,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) return 1; } +static void ftrace_update_trampoline(struct ftrace_ops *ops) +{ +} + #endif /* CONFIG_DYNAMIC_FTRACE */ __init void ftrace_init_global_array_ops(struct trace_array *tr) @@ -5075,12 +5330,12 @@ static int fpid_show(struct seq_file *m, void *v) const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); if (v == (void *)1) { - seq_printf(m, "no pid\n"); + seq_puts(m, "no pid\n"); return 0; } if (fpid->pid == ftrace_swapper_pid) - seq_printf(m, "swapper tasks\n"); + seq_puts(m, "swapper tasks\n"); else seq_printf(m, "%u\n", pid_vnr(fpid->pid)); @@ -5293,6 +5548,7 @@ static struct ftrace_ops graph_ops = { FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, + /* trampoline_size is only needed for dynamically allocated tramps */ #endif ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; @@ -5522,7 +5778,6 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, update_function_graph_func(); ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET); - out: mutex_unlock(&ftrace_lock); return ret; @@ -5543,6 +5798,17 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); +#ifdef CONFIG_DYNAMIC_FTRACE + /* + * Function graph does not allocate the trampoline, but + * other global_ops do. We need to reset the ALLOC_TRAMP flag + * if one was used. + */ + global_ops.trampoline = save_global_trampoline; + if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) + global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; +#endif + out: mutex_unlock(&ftrace_lock); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a56e07c8d15..7a4104cb95c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -34,21 +34,19 @@ static void update_pages_handler(struct work_struct *work); */ int ring_buffer_print_entry_header(struct trace_seq *s) { - int ret; - - ret = trace_seq_puts(s, "# compressed entry header\n"); - ret = trace_seq_puts(s, "\ttype_len : 5 bits\n"); - ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n"); - ret = trace_seq_puts(s, "\tarray : 32 bits\n"); - ret = trace_seq_putc(s, '\n'); - ret = trace_seq_printf(s, "\tpadding : type == %d\n", - RINGBUF_TYPE_PADDING); - ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", - RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata max type_len == %d\n", - RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + trace_seq_puts(s, "# compressed entry header\n"); + trace_seq_puts(s, "\ttype_len : 5 bits\n"); + trace_seq_puts(s, "\ttime_delta : 27 bits\n"); + trace_seq_puts(s, "\tarray : 32 bits\n"); + trace_seq_putc(s, '\n'); + trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); - return ret; + return !trace_seq_has_overflowed(s); } /* @@ -419,32 +417,31 @@ static inline int test_time_stamp(u64 delta) int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; - int ret; - - ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" - "offset:0;\tsize:%u;\tsigned:%u;\n", - (unsigned int)sizeof(field.time_stamp), - (unsigned int)is_signed_type(u64)); - - ret = trace_seq_printf(s, "\tfield: local_t commit;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - (unsigned int)sizeof(field.commit), - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: int overwrite;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), commit), - 1, - (unsigned int)is_signed_type(long)); - - ret = trace_seq_printf(s, "\tfield: char data;\t" - "offset:%u;\tsize:%u;\tsigned:%u;\n", - (unsigned int)offsetof(typeof(field), data), - (unsigned int)BUF_PAGE_SIZE, - (unsigned int)is_signed_type(char)); - return ret; + trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); + + trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, + (unsigned int)is_signed_type(long)); + + trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return !trace_seq_has_overflowed(s); } struct rb_irq_work { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 92f4a6cee17..2e767972e99 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -63,6 +63,10 @@ static bool __read_mostly tracing_selftest_running; */ bool __read_mostly tracing_selftest_disabled; +/* Pipe tracepoints to printk */ +struct trace_iterator *tracepoint_print_iter; +int tracepoint_printk; + /* For tracers that don't implement custom flags */ static struct tracer_opt dummy_tracer_opt[] = { { } @@ -155,10 +159,11 @@ __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); static int __init stop_trace_on_warning(char *str) { - __disable_trace_on_warning = 1; + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + __disable_trace_on_warning = 1; return 1; } -__setup("traceoff_on_warning=", stop_trace_on_warning); +__setup("traceoff_on_warning", stop_trace_on_warning); static int __init boot_alloc_snapshot(char *str) { @@ -192,6 +197,13 @@ static int __init set_trace_boot_clock(char *str) } __setup("trace_clock=", set_trace_boot_clock); +static int __init set_tracepoint_printk(char *str) +{ + if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0)) + tracepoint_printk = 1; + return 1; +} +__setup("tp_printk", set_tracepoint_printk); unsigned long long ns2usecs(cycle_t nsec) { @@ -938,19 +950,20 @@ out: return ret; } +/* TODO add a seq_buf_to_buffer() */ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) { int len; - if (s->len <= s->readpos) + if (trace_seq_used(s) <= s->seq.readpos) return -EBUSY; - len = s->len - s->readpos; + len = trace_seq_used(s) - s->seq.readpos; if (cnt > len) cnt = len; - memcpy(buf, s->buffer + s->readpos, cnt); + memcpy(buf, s->buffer + s->seq.readpos, cnt); - s->readpos += cnt; + s->seq.readpos += cnt; return cnt; } @@ -2029,7 +2042,7 @@ void trace_printk_init_buffers(void) pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); pr_warning("** **\n"); pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for produciton use. **\n"); + pr_warning("** unsafe for production use. **\n"); pr_warning("** **\n"); pr_warning("** If you see this message and you are not debugging **\n"); pr_warning("** the kernel, report this immediately to your vendor! **\n"); @@ -2158,9 +2171,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, goto out; } - len = vsnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); - if (len > TRACE_BUF_SIZE) - goto out; + len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); local_save_flags(flags); size = sizeof(*entry) + len + 1; @@ -2171,8 +2182,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, entry = ring_buffer_event_data(event); entry->ip = ip; - memcpy(&entry->buf, tbuffer, len); - entry->buf[len] = '\0'; + memcpy(&entry->buf, tbuffer, len + 1); if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); @@ -2509,14 +2519,14 @@ get_total_entries(struct trace_buffer *buf, static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n" + "# / _-----=> irqs-off \n" + "# | / _----=> need-resched \n" + "# || / _---=> hardirq/softirq \n" + "# ||| / _--=> preempt-depth \n" + "# |||| / delay \n" + "# cmd pid ||||| time | caller \n" + "# \\ / ||||| \\ | / \n"); } static void print_event_info(struct trace_buffer *buf, struct seq_file *m) @@ -2533,20 +2543,20 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m) static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n" + "# | | | | |\n"); } static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); - seq_puts(m, "# _-----=> irqs-off\n"); - seq_puts(m, "# / _----=> need-resched\n"); - seq_puts(m, "# | / _---=> hardirq/softirq\n"); - seq_puts(m, "# || / _--=> preempt-depth\n"); - seq_puts(m, "# ||| / delay\n"); - seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | |||| | |\n"); + seq_puts(m, "# _-----=> irqs-off\n" + "# / _----=> need-resched\n" + "# | / _---=> hardirq/softirq\n" + "# || / _--=> preempt-depth\n" + "# ||| / delay\n" + "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n" + "# | | | |||| | |\n"); } void @@ -2649,24 +2659,21 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (iter->iter_flags & TRACE_FILE_LAT_FMT) { - if (!trace_print_lat_context(iter)) - goto partial; - } else { - if (!trace_print_context(iter)) - goto partial; - } + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + trace_print_lat_context(iter); + else + trace_print_context(iter); } + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + if (event) return event->funcs->trace(iter, sym_flags, event); - if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) - goto partial; + trace_seq_printf(s, "Unknown type %d\n", entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_raw_fmt(struct trace_iterator *iter) @@ -2677,22 +2684,20 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; - if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - if (!trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, iter->ts)) - goto partial; - } + if (trace_flags & TRACE_ITER_CONTEXT_INFO) + trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts); + + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; event = ftrace_find_event(entry->type); if (event) return event->funcs->raw(iter, 0, event); - if (!trace_seq_printf(s, "%d ?\n", entry->type)) - goto partial; + trace_seq_printf(s, "%d ?\n", entry->type); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t print_hex_fmt(struct trace_iterator *iter) @@ -2705,9 +2710,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_HEX_FIELD_RET(s, entry->pid); - SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + SEQ_PUT_HEX_FIELD(s, entry->pid); + SEQ_PUT_HEX_FIELD(s, iter->cpu); + SEQ_PUT_HEX_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2717,9 +2724,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) return ret; } - SEQ_PUT_FIELD_RET(s, newline); + SEQ_PUT_FIELD(s, newline); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t print_bin_fmt(struct trace_iterator *iter) @@ -2731,9 +2738,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) entry = iter->ent; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { - SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, iter->cpu); - SEQ_PUT_FIELD_RET(s, iter->ts); + SEQ_PUT_FIELD(s, entry->pid); + SEQ_PUT_FIELD(s, iter->cpu); + SEQ_PUT_FIELD(s, iter->ts); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; } event = ftrace_find_event(entry->type); @@ -2779,10 +2788,12 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; - if (iter->lost_events && - !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", - iter->cpu, iter->lost_events)) - return TRACE_TYPE_PARTIAL_LINE; + if (iter->lost_events) { + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + } if (iter->trace && iter->trace->print_line) { ret = iter->trace->print_line(iter); @@ -2860,44 +2871,44 @@ static void test_ftrace_alive(struct seq_file *m) { if (!ftrace_is_dead()) return; - seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); + seq_puts(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n" + "# MAY BE MISSING FUNCTION EVENTS\n"); } #ifdef CONFIG_TRACER_MAX_TRACE static void show_snapshot_main_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n" + "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer.\n" + "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void show_snapshot_percpu_help(struct seq_file *m) { - seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); + seq_puts(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); + seq_puts(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n" + "# Takes a snapshot of the main buffer for this cpu.\n"); #else - seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); - seq_printf(m, "# Must use main snapshot file to allocate.\n"); + seq_puts(m, "# echo 1 > snapshot : Not supported with this kernel.\n" + "# Must use main snapshot file to allocate.\n"); #endif - seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + seq_puts(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n" + "# (Doesn't have to be '2' works with any number that\n" + "# is not a '0' or '1')\n"); } static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->tr->allocated_snapshot) - seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + seq_puts(m, "#\n# * Snapshot is allocated *\n#\n"); else - seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + seq_puts(m, "#\n# * Snapshot is freed *\n#\n"); - seq_printf(m, "# Snapshot commands:\n"); + seq_puts(m, "# Snapshot commands:\n"); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) show_snapshot_main_help(m); else @@ -3251,7 +3262,7 @@ static int t_show(struct seq_file *m, void *v) if (!t) return 0; - seq_printf(m, "%s", t->name); + seq_puts(m, t->name); if (t->next) seq_putc(m, ' '); else @@ -4314,6 +4325,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) goto out; } + trace_seq_init(&iter->seq); + /* * We make a copy of the current tracer to avoid concurrent * changes on it while we are reading. @@ -4507,18 +4520,18 @@ waitagain: trace_access_lock(iter->cpu_file); while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; - int len = iter->seq.len; + int save_len = iter->seq.seq.len; ret = print_trace_line(iter); if (ret == TRACE_TYPE_PARTIAL_LINE) { /* don't print partial lines */ - iter->seq.len = len; + iter->seq.seq.len = save_len; break; } if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); - if (iter->seq.len >= cnt) + if (trace_seq_used(&iter->seq) >= cnt) break; /* @@ -4534,7 +4547,7 @@ waitagain: /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); - if (iter->seq.readpos >= iter->seq.len) + if (iter->seq.seq.readpos >= trace_seq_used(&iter->seq)) trace_seq_init(&iter->seq); /* @@ -4568,20 +4581,33 @@ static size_t tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) { size_t count; + int save_len; int ret; /* Seq buffer is page-sized, exactly what we need. */ for (;;) { - count = iter->seq.len; + save_len = iter->seq.seq.len; ret = print_trace_line(iter); - count = iter->seq.len - count; - if (rem < count) { - rem = 0; - iter->seq.len -= count; + + if (trace_seq_has_overflowed(&iter->seq)) { + iter->seq.seq.len = save_len; break; } + + /* + * This should not be hit, because it should only + * be set if the iter->seq overflowed. But check it + * anyway to be safe. + */ if (ret == TRACE_TYPE_PARTIAL_LINE) { - iter->seq.len -= count; + iter->seq.seq.len = save_len; + break; + } + + count = trace_seq_used(&iter->seq) - save_len; + if (rem < count) { + rem = 0; + iter->seq.seq.len = save_len; break; } @@ -4662,13 +4688,13 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* Copy the data into the page, so we can start over. */ ret = trace_seq_to_buffer(&iter->seq, page_address(spd.pages[i]), - iter->seq.len); + trace_seq_used(&iter->seq)); if (ret < 0) { __free_page(spd.pages[i]); break; } spd.partial[i].offset = 0; - spd.partial[i].len = iter->seq.len; + spd.partial[i].len = trace_seq_used(&iter->seq); trace_seq_init(&iter->seq); } @@ -5668,7 +5694,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); - count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + count = simple_read_from_buffer(ubuf, count, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -5749,10 +5776,10 @@ ftrace_snapshot_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%ps:", (void *)ip); - seq_printf(m, "snapshot"); + seq_puts(m, "snapshot"); if (count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", count); @@ -6417,7 +6444,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m int ret; /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); if (WARN_ON_ONCE(parent != trace_instance_dir)) return -ENOENT; @@ -6444,7 +6471,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry) int ret; /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); if (WARN_ON_ONCE(parent != trace_instance_dir)) return -ENOENT; @@ -6631,11 +6658,19 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= TRACE_MAX_PRINT) - s->len = TRACE_MAX_PRINT; + if (s->seq.len >= TRACE_MAX_PRINT) + s->seq.len = TRACE_MAX_PRINT; + + /* + * More paranoid code. Although the buffer size is set to + * PAGE_SIZE, and TRACE_MAX_PRINT is 1000, this is just + * an extra layer of protection. + */ + if (WARN_ON_ONCE(s->seq.len >= s->seq.size)) + s->seq.len = s->seq.size - 1; /* should be zero ended, but we are paranoid. */ - s->buffer[s->len] = 0; + s->buffer[s->seq.len] = 0; printk(KERN_TRACE "%s", s->buffer); @@ -6874,6 +6909,19 @@ out: return ret; } +void __init trace_init(void) +{ + if (tracepoint_printk) { + tracepoint_print_iter = + kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL); + if (WARN_ON(!tracepoint_print_iter)) + tracepoint_printk = 0; + } + tracer_alloc_buffers(); + init_ftrace_syscalls(); + trace_event_init(); +} + __init static int clear_boot_tracer(void) { /* @@ -6893,6 +6941,5 @@ __init static int clear_boot_tracer(void) return 0; } -early_initcall(tracer_alloc_buffers); fs_initcall(tracer_init_debugfs); late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 385391fb1d3..8de48bac1ce 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -14,6 +14,7 @@ #include <linux/trace_seq.h> #include <linux/ftrace_event.h> #include <linux/compiler.h> +#include <linux/trace_seq.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ @@ -569,15 +570,6 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); -void tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc); - -void tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *cur, - unsigned long flags, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -597,9 +589,6 @@ void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); -void tracing_sched_switch_assign_trace(struct trace_array *tr); -void tracing_stop_sched_switch_record(void); -void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); @@ -719,6 +708,8 @@ enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; +extern char trace_find_mark(unsigned long long duration); + /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -737,7 +728,7 @@ extern unsigned long trace_flags; extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); -extern enum print_line_t +extern void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); extern void graph_trace_open(struct trace_iterator *iter); extern void graph_trace_close(struct trace_iterator *iter); @@ -1310,4 +1301,18 @@ int perf_ftrace_event_register(struct ftrace_event_call *call, #define perf_ftrace_event_register NULL #endif +#ifdef CONFIG_FTRACE_SYSCALLS +void init_ftrace_syscalls(void); +#else +static inline void init_ftrace_syscalls(void) { } +#endif + +#ifdef CONFIG_EVENT_TRACING +void trace_event_init(void); +#else +static inline void __init trace_event_init(void) { } +#endif + +extern struct trace_iterator *tracepoint_print_iter; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 697fb9bac8f..7d6e2afde66 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -151,22 +151,21 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", - field->correct ? " ok " : " MISS ", - field->func, - field->file, - field->line)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", + field->correct ? " ok " : " MISS ", + field->func, + field->file, + field->line); + + return trace_handle_return(&iter->seq); } static void branch_print_header(struct seq_file *s) { seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" - " FUNC:FILE:LINE\n"); - seq_puts(s, "# | | | | | " - " |\n"); + " FUNC:FILE:LINE\n" + "# | | | | | " + " |\n"); } static struct trace_event_functions trace_branch_funcs = { @@ -233,12 +232,12 @@ extern unsigned long __stop_annotated_branch_profile[]; static int annotated_branch_stat_headers(struct seq_file *m) { - seq_printf(m, " correct incorrect %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); + seq_puts(m, " correct incorrect % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); return 0; } @@ -274,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v) seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); if (percent < 0) - seq_printf(m, " X "); + seq_puts(m, " X "); else seq_printf(m, "%3ld ", percent); seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); @@ -362,12 +361,12 @@ extern unsigned long __stop_branch_profile[]; static int all_branch_stat_headers(struct seq_file *m) { - seq_printf(m, " miss hit %% "); - seq_printf(m, " Function " - " File Line\n" - " ------- --------- - " - " -------- " - " ---- ----\n"); + seq_puts(m, " miss hit % " + " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); return 0; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0cc51edde3a..366a78a3e61 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -212,8 +212,40 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, } EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); +static DEFINE_SPINLOCK(tracepoint_iter_lock); + +static void output_printk(struct ftrace_event_buffer *fbuffer) +{ + struct ftrace_event_call *event_call; + struct trace_event *event; + unsigned long flags; + struct trace_iterator *iter = tracepoint_print_iter; + + if (!iter) + return; + + event_call = fbuffer->ftrace_file->event_call; + if (!event_call || !event_call->event.funcs || + !event_call->event.funcs->trace) + return; + + event = &fbuffer->ftrace_file->event_call->event; + + spin_lock_irqsave(&tracepoint_iter_lock, flags); + trace_seq_init(&iter->seq); + iter->ent = fbuffer->entry; + event_call->event.funcs->trace(iter, 0, event); + trace_seq_putc(&iter->seq, 0); + printk("%s", iter->seq.buffer); + + spin_unlock_irqrestore(&tracepoint_iter_lock, flags); +} + void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) { + if (tracepoint_printk) + output_printk(fbuffer); + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, fbuffer->event, fbuffer->entry, fbuffer->flags, fbuffer->pc); @@ -461,7 +493,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) if (dir) { spin_lock(&dir->d_lock); /* probably unneeded */ - list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { + list_for_each_entry(child, &dir->d_subdirs, d_child) { if (child->d_inode) /* probably unneeded */ child->d_inode->i_private = NULL; } @@ -918,7 +950,7 @@ static int f_show(struct seq_file *m, void *v) case FORMAT_HEADER: seq_printf(m, "name: %s\n", ftrace_event_name(call)); seq_printf(m, "ID: %d\n", call->event.type); - seq_printf(m, "format:\n"); + seq_puts(m, "format:\n"); return 0; case FORMAT_FIELD_SEPERATOR: @@ -1044,7 +1076,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_unlock(&event_mutex); if (file) - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1210,7 +1243,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); print_subsystem_event_filter(system, s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1265,7 +1299,8 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) trace_seq_init(s); func(s); - r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, trace_seq_used(s)); kfree(s); @@ -1988,7 +2023,7 @@ event_enable_print(struct seq_file *m, unsigned long ip, ftrace_event_name(data->file->event_call)); if (data->count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", data->count); @@ -2477,8 +2512,14 @@ static __init int event_trace_init(void) #endif return 0; } -early_initcall(event_trace_memsetup); -core_initcall(event_trace_enable); + +void __init trace_event_init(void) +{ + event_trace_memsetup(); + init_ftrace_syscalls(); + event_trace_enable(); +} + fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7a8c1528e14..ced69da0ff5 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -45,6 +45,7 @@ enum filter_op_ids OP_GT, OP_GE, OP_BAND, + OP_NOT, OP_NONE, OP_OPEN_PAREN, }; @@ -67,6 +68,7 @@ static struct filter_op filter_ops[] = { { OP_GT, ">", 5 }, { OP_GE, ">=", 5 }, { OP_BAND, "&", 6 }, + { OP_NOT, "!", 6 }, { OP_NONE, "OP_NONE", 0 }, { OP_OPEN_PAREN, "(", 0 }, }; @@ -85,6 +87,7 @@ enum { FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, FILT_ERR_IP_FIELD_ONLY, + FILT_ERR_ILLEGAL_NOT_OP, }; static char *err_text[] = { @@ -101,6 +104,7 @@ static char *err_text[] = { "Missing field name and/or value", "Meaningless filter expression", "Only 'ip' field is supported for function trace", + "Illegal use of '!'", }; struct opstack_op { @@ -139,6 +143,7 @@ struct pred_stack { int index; }; +/* If not of not match is equal to not of not, then it is a match */ #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -166,7 +171,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \ break; \ } \ \ - return match; \ + return !!match == !pred->not; \ } #define DEFINE_EQUALITY_PRED(size) \ @@ -484,9 +489,10 @@ static int process_ops(struct filter_pred *preds, if (!WARN_ON_ONCE(!pred->fn)) match = pred->fn(pred, rec); if (!!match == type) - return match; + break; } - return match; + /* If not of not match is equal to not of not, then it is a match */ + return !!match == !op->not; } struct filter_match_preds_data { @@ -735,10 +741,10 @@ static int filter_set_pred(struct event_filter *filter, * then this op can be folded. */ if (left->index & FILTER_PRED_FOLD && - (left->op == dest->op || + ((left->op == dest->op && !left->not) || left->left == FILTER_PRED_INVALID) && right->index & FILTER_PRED_FOLD && - (right->op == dest->op || + ((right->op == dest->op && !right->not) || right->left == FILTER_PRED_INVALID)) dest->index |= FILTER_PRED_FOLD; @@ -1028,7 +1034,7 @@ static int init_pred(struct filter_parse_state *ps, } if (pred->op == OP_NE) - pred->not = 1; + pred->not ^= 1; pred->fn = fn; return 0; @@ -1590,6 +1596,17 @@ static int replace_preds(struct ftrace_event_call *call, continue; } + if (elt->op == OP_NOT) { + if (!n_preds || operand1 || operand2) { + parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0); + err = -EINVAL; + goto fail; + } + if (!dry_run) + filter->preds[n_preds - 1].not ^= 1; + continue; + } + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); err = -ENOSPC; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 4747b476a03..8712df9decb 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -373,7 +373,7 @@ event_trigger_print(const char *name, struct seq_file *m, { long count = (long)data; - seq_printf(m, "%s", name); + seq_puts(m, name); if (count == -1) seq_puts(m, ":unlimited"); @@ -383,7 +383,7 @@ event_trigger_print(const char *name, struct seq_file *m, if (filter_str) seq_printf(m, " if %s\n", filter_str); else - seq_puts(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -1105,7 +1105,7 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, if (data->filter_str) seq_printf(m, " if %s\n", data->filter_str); else - seq_puts(m, "\n"); + seq_putc(m, '\n'); return 0; } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 57f0ec962d2..fcd41a16640 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -261,37 +261,74 @@ static struct tracer function_trace __tracer_data = }; #ifdef CONFIG_DYNAMIC_FTRACE -static int update_count(void **data) +static void update_traceon_count(void **data, bool on) { - unsigned long *count = (long *)data; + long *count = (long *)data; + long old_count = *count; - if (!*count) - return 0; + /* + * Tracing gets disabled (or enabled) once per count. + * This function can be called at the same time on multiple CPUs. + * It is fine if both disable (or enable) tracing, as disabling + * (or enabling) the second time doesn't do anything as the + * state of the tracer is already disabled (or enabled). + * What needs to be synchronized in this case is that the count + * only gets decremented once, even if the tracer is disabled + * (or enabled) twice, as the second one is really a nop. + * + * The memory barriers guarantee that we only decrement the + * counter once. First the count is read to a local variable + * and a read barrier is used to make sure that it is loaded + * before checking if the tracer is in the state we want. + * If the tracer is not in the state we want, then the count + * is guaranteed to be the old count. + * + * Next the tracer is set to the state we want (disabled or enabled) + * then a write memory barrier is used to make sure that + * the new state is visible before changing the counter by + * one minus the old counter. This guarantees that another CPU + * executing this code will see the new state before seeing + * the new counter value, and would not do anything if the new + * counter is seen. + * + * Note, there is no synchronization between this and a user + * setting the tracing_on file. But we currently don't care + * about that. + */ + if (!old_count) + return; - if (*count != -1) - (*count)--; + /* Make sure we see count before checking tracing state */ + smp_rmb(); - return 1; + if (on == !!tracing_is_on()) + return; + + if (on) + tracing_on(); + else + tracing_off(); + + /* unlimited? */ + if (old_count == -1) + return; + + /* Make sure tracing state is visible before updating count */ + smp_wmb(); + + *count = old_count - 1; } static void ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (tracing_is_on()) - return; - - if (update_count(data)) - tracing_on(); + update_traceon_count(data, 1); } static void ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (!tracing_is_on()) - return; - - if (update_count(data)) - tracing_off(); + update_traceon_count(data, 0); } static void @@ -330,11 +367,49 @@ ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) static void ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) { - if (!tracing_is_on()) - return; + long *count = (long *)data; + long old_count; + long new_count; - if (update_count(data)) - trace_dump_stack(STACK_SKIP); + /* + * Stack traces should only execute the number of times the + * user specified in the counter. + */ + do { + + if (!tracing_is_on()) + return; + + old_count = *count; + + if (!old_count) + return; + + /* unlimited? */ + if (old_count == -1) { + trace_dump_stack(STACK_SKIP); + return; + } + + new_count = old_count - 1; + new_count = cmpxchg(count, old_count, new_count); + if (new_count == old_count) + trace_dump_stack(STACK_SKIP); + + } while (new_count != old_count); +} + +static int update_count(void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return 0; + + if (*count != -1) + (*count)--; + + return 1; } static void @@ -361,7 +436,7 @@ ftrace_probe_print(const char *name, struct seq_file *m, seq_printf(m, "%ps:%s", (void *)ip, name); if (count == -1) - seq_printf(m, ":unlimited\n"); + seq_puts(m, ":unlimited\n"); else seq_printf(m, ":count=%ld\n", count); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f0a0c982cde..ba476009e5d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -107,7 +107,7 @@ enum { FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, }; -static enum print_line_t +static void print_graph_duration(unsigned long long duration, struct trace_seq *s, u32 flags); @@ -483,33 +483,24 @@ static int graph_trace_update_thresh(struct trace_array *tr) static int max_bytes_for_cpu; -static enum print_line_t -print_graph_cpu(struct trace_seq *s, int cpu) +static void print_graph_cpu(struct trace_seq *s, int cpu) { - int ret; - /* * Start with a space character - to make it stand out * to the right a bit when trace output is pasted into * email: */ - ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); } #define TRACE_GRAPH_PROCINFO_LENGTH 14 -static enum print_line_t -print_graph_proc(struct trace_seq *s, pid_t pid) +static void print_graph_proc(struct trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ char pid_str[11]; int spaces = 0; - int ret; int len; int i; @@ -524,56 +515,43 @@ print_graph_proc(struct trace_seq *s, pid_t pid) spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; /* First spaces to align center */ - for (i = 0; i < spaces / 2; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < spaces / 2; i++) + trace_seq_putc(s, ' '); - ret = trace_seq_printf(s, "%s-%s", comm, pid_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s-%s", comm, pid_str); /* Last spaces to align center */ - for (i = 0; i < spaces - (spaces / 2); i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; + for (i = 0; i < spaces - (spaces / 2); i++) + trace_seq_putc(s, ' '); } -static enum print_line_t -print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - if (!trace_seq_putc(s, ' ')) - return 0; - - return trace_print_lat_fmt(s, entry); + trace_seq_putc(s, ' '); + trace_print_lat_fmt(s, entry); } /* If the pid changed since the last trace, output this event */ -static enum print_line_t +static void verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) { pid_t prev_pid; pid_t *last_pid; - int ret; if (!data) - return TRACE_TYPE_HANDLED; + return; last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); if (*last_pid == pid) - return TRACE_TYPE_HANDLED; + return; prev_pid = *last_pid; *last_pid = pid; if (prev_pid == -1) - return TRACE_TYPE_HANDLED; + return; /* * Context-switch trace line: @@ -582,33 +560,12 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) ------------------------------------------ */ - ret = trace_seq_puts(s, - " ------------------------------------------\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, prev_pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, " => "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, - "\n ------------------------------------------\n\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " ------------------------------------------\n"); + print_graph_cpu(s, cpu); + print_graph_proc(s, prev_pid); + trace_seq_puts(s, " => "); + print_graph_proc(s, pid); + trace_seq_puts(s, "\n ------------------------------------------\n\n"); } static struct ftrace_graph_ret_entry * @@ -682,175 +639,122 @@ get_return_for_leaf(struct trace_iterator *iter, return next; } -static int print_graph_abs_time(u64 t, struct trace_seq *s) +static void print_graph_abs_time(u64 t, struct trace_seq *s) { unsigned long usecs_rem; usecs_rem = do_div(t, NSEC_PER_SEC); usecs_rem /= 1000; - return trace_seq_printf(s, "%5lu.%06lu | ", - (unsigned long)t, usecs_rem); + trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); } -static enum print_line_t +static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { - int ret; struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; if (addr < (unsigned long)__irqentry_text_start || addr >= (unsigned long)__irqentry_text_end) - return TRACE_TYPE_UNHANDLED; + return; if (trace_flags & TRACE_ITER_CONTEXT_INFO) { /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); /* Proc */ if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - ret = trace_seq_puts(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_proc(s, pid); + trace_seq_puts(s, " | "); } + + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); } /* No overhead */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_START); if (type == TRACE_GRAPH_ENT) - ret = trace_seq_puts(s, "==========>"); + trace_seq_puts(s, "==========>"); else - ret = trace_seq_puts(s, "<=========="); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - ret = trace_seq_putc(s, '\n'); + trace_seq_puts(s, "<=========="); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + print_graph_duration(0, s, flags | FLAGS_FILL_END); + trace_seq_putc(s, '\n'); } -enum print_line_t +void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ - char msecs_str[21]; + char usecs_str[21]; char nsecs_str[5]; - int ret, len; + int len; int i; - sprintf(msecs_str, "%lu", (unsigned long) duration); + sprintf(usecs_str, "%lu", (unsigned long) duration); /* Print msecs */ - ret = trace_seq_printf(s, "%s", msecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s", usecs_str); - len = strlen(msecs_str); + len = strlen(usecs_str); /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); snprintf(nsecs_str, slen, "%03lu", nsecs_rem); - ret = trace_seq_printf(s, ".%s", nsecs_str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, ".%s", nsecs_str); len += strlen(nsecs_str); } - ret = trace_seq_puts(s, " us "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " us "); /* Print remaining spaces to fit the row's width */ - for (i = len; i < 7; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } - return TRACE_TYPE_HANDLED; + for (i = len; i < 7; i++) + trace_seq_putc(s, ' '); } -static enum print_line_t +static void print_graph_duration(unsigned long long duration, struct trace_seq *s, u32 flags) { - int ret = -1; - if (!(flags & TRACE_GRAPH_PRINT_DURATION) || !(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return TRACE_TYPE_HANDLED; + return; /* No real adata, just filling the column with spaces */ switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { case FLAGS_FILL_FULL: - ret = trace_seq_puts(s, " | "); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " | "); + return; case FLAGS_FILL_START: - ret = trace_seq_puts(s, " "); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " "); + return; case FLAGS_FILL_END: - ret = trace_seq_puts(s, " |"); - return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, " |"); + return; } /* Signal a overhead of time execution to the output */ - if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { - /* Duration exceeded 100 msecs */ - if (duration > 100000ULL) - ret = trace_seq_puts(s, "! "); - /* Duration exceeded 10 msecs */ - else if (duration > 10000ULL) - ret = trace_seq_puts(s, "+ "); - } - - /* - * The -1 means we either did not exceed the duration tresholds - * or we dont want to print out the overhead. Either way we need - * to fill out the space. - */ - if (ret == -1) - ret = trace_seq_puts(s, " "); - - /* Catching here any failure happenned above */ - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_print_graph_duration(duration, s); - if (ret != TRACE_TYPE_HANDLED) - return ret; - - ret = trace_seq_puts(s, "| "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) + trace_seq_printf(s, "%c ", trace_find_mark(duration)); + else + trace_seq_puts(s, " "); - return TRACE_TYPE_HANDLED; + trace_print_graph_duration(duration, s); + trace_seq_puts(s, "| "); } /* Case of a leaf function on its call entry */ @@ -864,7 +768,6 @@ print_graph_entry_leaf(struct trace_iterator *iter, struct ftrace_graph_ret *graph_ret; struct ftrace_graph_ent *call; unsigned long long duration; - int ret; int i; graph_ret = &ret_entry->ret; @@ -890,22 +793,15 @@ print_graph_entry_leaf(struct trace_iterator *iter, } /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_duration(duration, s, flags); /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); - ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%ps();\n", (void *)call->func); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -915,7 +811,6 @@ print_graph_entry_nested(struct trace_iterator *iter, { struct ftrace_graph_ent *call = &entry->graph_ent; struct fgraph_data *data = iter->private; - int ret; int i; if (data) { @@ -931,19 +826,15 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); /* Function */ - for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); + + trace_seq_printf(s, "%ps() {\n", (void *)call->func); - ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); - if (!ret) + if (trace_seq_has_overflowed(s)) return TRACE_TYPE_PARTIAL_LINE; /* @@ -953,62 +844,43 @@ print_graph_entry_nested(struct trace_iterator *iter, return TRACE_TYPE_NO_CONSUME; } -static enum print_line_t +static void print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, int type, unsigned long addr, u32 flags) { struct fgraph_data *data = iter->private; struct trace_entry *ent = iter->ent; int cpu = iter->cpu; - int ret; /* Pid */ - if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + verif_pid(s, ent->pid, cpu, data); - if (type) { + if (type) /* Interrupt */ - ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + print_graph_irq(iter, addr, type, cpu, ent->pid, flags); if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) - return 0; + return; /* Absolute time */ - if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { - ret = print_graph_abs_time(iter->ts, s); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + print_graph_abs_time(iter->ts, s); /* Cpu */ - if (flags & TRACE_GRAPH_PRINT_CPU) { - ret = print_graph_cpu(s, cpu); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_CPU) + print_graph_cpu(s, cpu); /* Proc */ if (flags & TRACE_GRAPH_PRINT_PROC) { - ret = print_graph_proc(s, ent->pid); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_puts(s, " | "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_proc(s, ent->pid); + trace_seq_puts(s, " | "); } /* Latency format */ - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - ret = print_graph_lat_fmt(s, ent); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_LATENCY_FMT) + print_graph_lat_fmt(s, ent); - return 0; + return; } /* @@ -1126,8 +998,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, if (check_irq_entry(iter, flags, call->func, call->depth)) return TRACE_TYPE_HANDLED; - if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags); leaf_ret = get_return_for_leaf(iter, field); if (leaf_ret) @@ -1160,7 +1031,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, pid_t pid = ent->pid; int cpu = iter->cpu; int func_match = 1; - int ret; int i; if (check_irq_return(iter, flags, trace->depth)) @@ -1186,20 +1056,14 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, } } - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, 0, 0, flags); /* Overhead and duration */ - ret = print_graph_duration(duration, s, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_duration(duration, s, flags); /* Closing brace */ - for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); /* * If the return function does not have a matching entry, @@ -1208,30 +1072,20 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, * belongs to, write out the function name. Always do * that if the funcgraph-tail option is enabled. */ - if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) { - ret = trace_seq_puts(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } else { - ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) + trace_seq_puts(s, "}\n"); + else + trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); /* Overrun */ - if (flags & TRACE_GRAPH_PRINT_OVERRUN) { - ret = trace_seq_printf(s, " (Overruns: %lu)\n", - trace->overrun); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (flags & TRACE_GRAPH_PRINT_OVERRUN) + trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); - ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, - cpu, pid, flags); - if (ret == TRACE_TYPE_PARTIAL_LINE) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -1248,26 +1102,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (data) depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; - if (print_graph_prologue(iter, s, 0, 0, flags)) - return TRACE_TYPE_PARTIAL_LINE; + print_graph_prologue(iter, s, 0, 0, flags); /* No time */ - ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); - if (ret != TRACE_TYPE_HANDLED) - return ret; + print_graph_duration(0, s, flags | FLAGS_FILL_FULL); /* Indentation */ if (depth > 0) - for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { - ret = trace_seq_putc(s, ' '); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) + trace_seq_putc(s, ' '); /* The comment */ - ret = trace_seq_puts(s, "/* "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, "/* "); switch (iter->ent->type) { case TRACE_BPRINT: @@ -1290,17 +1136,18 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return ret; } + if (trace_seq_has_overflowed(s)) + goto out; + /* Strip ending newline */ - if (s->buffer[s->len - 1] == '\n') { - s->buffer[s->len - 1] = '\0'; - s->len--; + if (s->buffer[s->seq.len - 1] == '\n') { + s->buffer[s->seq.len - 1] = '\0'; + s->seq.len--; } - ret = trace_seq_puts(s, " */\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " */\n"); + out: + return trace_handle_return(s); } @@ -1407,32 +1254,32 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) print_lat_header(s, flags); /* 1st line */ - seq_printf(s, "#"); + seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " TIME "); + seq_puts(s, " TIME "); if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " CPU"); + seq_puts(s, " CPU"); if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " TASK/PID "); + seq_puts(s, " TASK/PID "); if (lat) - seq_printf(s, "||||"); + seq_puts(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, " DURATION "); - seq_printf(s, " FUNCTION CALLS\n"); + seq_puts(s, " DURATION "); + seq_puts(s, " FUNCTION CALLS\n"); /* 2nd line */ - seq_printf(s, "#"); + seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) - seq_printf(s, " | "); + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_CPU) - seq_printf(s, " | "); + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_PROC) - seq_printf(s, " | | "); + seq_puts(s, " | | "); if (lat) - seq_printf(s, "||||"); + seq_puts(s, "||||"); if (flags & TRACE_GRAPH_PRINT_DURATION) - seq_printf(s, " | | "); - seq_printf(s, " | | | |\n"); + seq_puts(s, " | | "); + seq_puts(s, " | | | |\n"); } static void print_graph_headers(struct seq_file *s) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index bd90e1b0608..b0b1c44e923 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -20,10 +20,12 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; unsigned int old_userobj; int cnt = 0, cpu; trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); @@ -57,19 +59,19 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } - if (!trace_empty(&iter)) - trace_find_next_entry_inc(&iter); - while (!trace_empty(&iter)) { + + while (trace_find_next_entry_inc(&iter)) { if (!cnt) kdb_printf("---------------------------------\n"); cnt++; - if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + if (!skip_lines) { print_trace_line(&iter); - if (!skip_lines) trace_printk_seq(&iter.seq); - else + } else { skip_lines--; + } + if (KDB_FLAG(CMD_INTERRUPT)) goto out; } @@ -86,9 +88,12 @@ out: atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - for_each_tracing_cpu(cpu) - if (iter.buffer_iter[cpu]) + for_each_tracing_cpu(cpu) { + if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); + iter.buffer_iter[cpu] = NULL; + } + } } /* diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 282f6e4e553..5edb518be34 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -826,7 +826,7 @@ static int probes_seq_show(struct seq_file *m, void *v) struct trace_kprobe *tk = v; int i; - seq_printf(m, "%c", trace_kprobe_is_return(tk) ? 'r' : 'p'); + seq_putc(m, trace_kprobe_is_return(tk) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tk->tp.call.class->system, ftrace_event_name(&tk->tp.call)); @@ -840,7 +840,7 @@ static int probes_seq_show(struct seq_file *m, void *v) for (i = 0; i < tk->tp.nr_args; i++) seq_printf(m, " %s=%s", tk->tp.args[i].name, tk->tp.args[i].comm); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -1024,27 +1024,22 @@ print_kprobe_event(struct trace_iterator *iter, int flags, field = (struct kprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) - goto partial; + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, ")")) - goto partial; + trace_seq_putc(s, ')'); data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, data + tp->args[i].offset, field)) - goto partial; - - if (!trace_seq_puts(s, "\n")) - goto partial; + goto out; - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_putc(s, '\n'); + out: + return trace_handle_return(s); } static enum print_line_t @@ -1060,33 +1055,28 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, field = (struct kretprobe_trace_entry_head *)iter->ent; tp = container_of(event, struct trace_probe, call.event); - if (!trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call))) - goto partial; + trace_seq_printf(s, "%s: (", ftrace_event_name(&tp->call)); if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, " <- ")) - goto partial; + trace_seq_puts(s, " <- "); if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) - goto partial; + goto out; - if (!trace_seq_puts(s, ")")) - goto partial; + trace_seq_putc(s, ')'); data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, data + tp->args[i].offset, field)) - goto partial; + goto out; - if (!trace_seq_puts(s, "\n")) - goto partial; + trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; -partial: - return TRACE_TYPE_PARTIAL_LINE; + out: + return trace_handle_return(s); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 0abd9b86347..7a9ba62e9fe 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -59,17 +59,15 @@ static void mmio_trace_start(struct trace_array *tr) mmio_reset_data(tr); } -static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +static void mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) { - int ret = 0; int i; resource_size_t start, end; const struct pci_driver *drv = pci_dev_driver(dev); - /* XXX: incomplete checks for trace_seq_printf() return value */ - ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", - dev->bus->number, dev->devfn, - dev->vendor, dev->device, dev->irq); + trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); /* * XXX: is pci_resource_to_user() appropriate, since we are * supposed to interpret the __ioremap() phys_addr argument based on @@ -77,21 +75,20 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) */ for (i = 0; i < 7; i++) { pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", + trace_seq_printf(s, " %llx", (unsigned long long)(start | (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); } for (i = 0; i < 7; i++) { pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); - ret += trace_seq_printf(s, " %llx", + trace_seq_printf(s, " %llx", dev->resource[i].start < dev->resource[i].end ? (unsigned long long)(end - start) + 1 : 0); } if (drv) - ret += trace_seq_printf(s, " %s\n", drv->name); + trace_seq_printf(s, " %s\n", drv->name); else - ret += trace_seq_puts(s, " \n"); - return ret; + trace_seq_puts(s, " \n"); } static void destroy_header_iter(struct header_iter *hiter) @@ -179,28 +176,27 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret = 1; trace_assign_type(field, entry); rw = &field->rw; switch (rw->opcode) { case MMIO_READ: - ret = trace_seq_printf(s, + trace_seq_printf(s, "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", rw->width, secs, usec_rem, rw->map_id, (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: - ret = trace_seq_printf(s, + trace_seq_printf(s, "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," "%02lx 0x%lx %d\n", secs, usec_rem, rw->map_id, @@ -209,12 +205,11 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) (rw->value >> 0) & 0xff, rw->pc, 0); break; default: - ret = trace_seq_puts(s, "rw what?\n"); + trace_seq_puts(s, "rw what?\n"); break; } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t mmio_print_map(struct trace_iterator *iter) @@ -226,31 +221,29 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret; trace_assign_type(field, entry); m = &field->map; switch (m->opcode) { case MMIO_PROBE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", secs, usec_rem, m->map_id, (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: - ret = trace_seq_printf(s, + trace_seq_printf(s, "UNMAP %u.%06lu %d 0x%lx %d\n", secs, usec_rem, m->map_id, 0UL, 0); break; default: - ret = trace_seq_puts(s, "map what?\n"); + trace_seq_puts(s, "map what?\n"); break; } - if (ret) - return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE; + + return trace_handle_return(s); } static enum print_line_t mmio_print_mark(struct trace_iterator *iter) @@ -262,14 +255,11 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter) unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; - int ret; /* The trailing newline must be in the message. */ - ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t mmio_print_line(struct trace_iterator *iter) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c6977d5a9b1..b77b9a69761 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -25,15 +25,12 @@ enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct bputs_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_puts(s, field->str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, field->str); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) @@ -41,15 +38,12 @@ enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct bprint_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_bprintf(s, field->fmt, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_bprintf(s, field->fmt, field->buf); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) @@ -57,15 +51,12 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; struct print_entry *field; - int ret; trace_assign_type(field, entry); - ret = trace_seq_puts(s, field->buf); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_puts(s, field->buf); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } const char * @@ -124,7 +115,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, if (ret == (const char *)(trace_seq_buffer_ptr(p))) trace_seq_printf(p, "0x%lx", val); - + trace_seq_putc(p, 0); return ret; @@ -193,7 +184,6 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; struct trace_seq *p = &iter->tmp_seq; struct trace_entry *entry; - int ret; event = container_of(trace_event, struct ftrace_event_call, event); entry = iter->ent; @@ -204,11 +194,9 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } trace_seq_init(p); - ret = trace_seq_printf(s, "%s: ", ftrace_event_name(event)); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: ", ftrace_event_name(event)); - return 0; + return trace_handle_return(s); } EXPORT_SYMBOL(ftrace_raw_output_prep); @@ -216,18 +204,11 @@ static int ftrace_output_raw(struct trace_iterator *iter, char *name, char *fmt, va_list ap) { struct trace_seq *s = &iter->seq; - int ret; - - ret = trace_seq_printf(s, "%s: ", name); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - ret = trace_seq_vprintf(s, fmt, ap); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: ", name); + trace_seq_vprintf(s, fmt, ap); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) @@ -260,7 +241,7 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static int +static void seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) { #ifdef CONFIG_KALLSYMS @@ -271,12 +252,11 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) name = kretprobed(str); - return trace_seq_printf(s, fmt, name); + trace_seq_printf(s, fmt, name); #endif - return 1; } -static int +static void seq_print_sym_offset(struct trace_seq *s, const char *fmt, unsigned long address) { @@ -287,9 +267,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt, sprint_symbol(str, address); name = kretprobed(str); - return trace_seq_printf(s, fmt, name); + trace_seq_printf(s, fmt, name); #endif - return 1; } #ifndef CONFIG_64BIT @@ -320,14 +299,14 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, if (file) { ret = trace_seq_path(s, &file->f_path); if (ret) - ret = trace_seq_printf(s, "[+0x%lx]", - ip - vmstart); + trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); } up_read(&mm->mmap_sem); } if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; + trace_seq_printf(s, " <" IP_FMT ">", ip); + return !trace_seq_has_overflowed(s); } int @@ -335,7 +314,6 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags) { struct mm_struct *mm = NULL; - int ret = 1; unsigned int i; if (trace_flags & TRACE_ITER_SYM_USEROBJ) { @@ -354,48 +332,45 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { unsigned long ip = entry->caller[i]; - if (ip == ULONG_MAX || !ret) + if (ip == ULONG_MAX || trace_seq_has_overflowed(s)) break; - if (ret) - ret = trace_seq_puts(s, " => "); + + trace_seq_puts(s, " => "); + if (!ip) { - if (ret) - ret = trace_seq_puts(s, "??"); - if (ret) - ret = trace_seq_putc(s, '\n'); + trace_seq_puts(s, "??"); + trace_seq_putc(s, '\n'); continue; } - if (!ret) - break; - if (ret) - ret = seq_print_user_ip(s, mm, ip, sym_flags); - ret = trace_seq_putc(s, '\n'); + + seq_print_user_ip(s, mm, ip, sym_flags); + trace_seq_putc(s, '\n'); } if (mm) mmput(mm); - return ret; + + return !trace_seq_has_overflowed(s); } int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) { - int ret; - - if (!ip) - return trace_seq_putc(s, '0'); + if (!ip) { + trace_seq_putc(s, '0'); + goto out; + } if (sym_flags & TRACE_ITER_SYM_OFFSET) - ret = seq_print_sym_offset(s, "%s", ip); + seq_print_sym_offset(s, "%s", ip); else - ret = seq_print_sym_short(s, "%s", ip); - - if (!ret) - return 0; + seq_print_sym_short(s, "%s", ip); if (sym_flags & TRACE_ITER_SYM_ADDR) - ret = trace_seq_printf(s, " <" IP_FMT ">", ip); - return ret; + trace_seq_printf(s, " <" IP_FMT ">", ip); + + out: + return !trace_seq_has_overflowed(s); } /** @@ -413,7 +388,6 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; - int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -445,16 +419,15 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 's' : '.'; - if (!trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq)) - return 0; + trace_seq_printf(s, "%c%c%c", + irqs_off, need_resched, hardsoft_irq); if (entry->preempt_count) - ret = trace_seq_printf(s, "%x", entry->preempt_count); + trace_seq_printf(s, "%x", entry->preempt_count); else - ret = trace_seq_putc(s, '.'); + trace_seq_putc(s, '.'); - return ret; + return !trace_seq_has_overflowed(s); } static int @@ -464,14 +437,38 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) trace_find_cmdline(entry->pid, comm); - if (!trace_seq_printf(s, "%8.8s-%-5d %3d", - comm, entry->pid, cpu)) - return 0; + trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu); return trace_print_lat_fmt(s, entry); } -static unsigned long preempt_mark_thresh_us = 100; +#undef MARK +#define MARK(v, s) {.val = v, .sym = s} +/* trace overhead mark */ +static const struct trace_mark { + unsigned long long val; /* unit: nsec */ + char sym; +} mark[] = { + MARK(1000000000ULL , '$'), /* 1 sec */ + MARK(1000000ULL , '#'), /* 1000 usecs */ + MARK(100000ULL , '!'), /* 100 usecs */ + MARK(10000ULL , '+'), /* 10 usecs */ +}; +#undef MARK + +char trace_find_mark(unsigned long long d) +{ + int i; + int size = ARRAY_SIZE(mark); + + for (i = 0; i < size; i++) { + if (d >= mark[i].val) + break; + } + + return (i == size) ? ' ' : mark[i].sym; +} static int lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) @@ -493,24 +490,28 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) unsigned long rel_usec = do_div(rel_ts, USEC_PER_MSEC); unsigned long rel_msec = (unsigned long)rel_ts; - return trace_seq_printf( - s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", - ns2usecs(iter->ts), - abs_msec, abs_usec, - rel_msec, rel_usec); + trace_seq_printf( + s, "[%08llx] %ld.%03ldms (+%ld.%03ldms): ", + ns2usecs(iter->ts), + abs_msec, abs_usec, + rel_msec, rel_usec); + } else if (verbose && !in_ns) { - return trace_seq_printf( - s, "[%016llx] %lld (+%lld): ", - iter->ts, abs_ts, rel_ts); + trace_seq_printf( + s, "[%016llx] %lld (+%lld): ", + iter->ts, abs_ts, rel_ts); + } else if (!verbose && in_ns) { - return trace_seq_printf( - s, " %4lldus%c: ", - abs_ts, - rel_ts > preempt_mark_thresh_us ? '!' : - rel_ts > 1 ? '+' : ' '); + trace_seq_printf( + s, " %4lldus%c: ", + abs_ts, + trace_find_mark(rel_ts * NSEC_PER_USEC)); + } else { /* !verbose && !in_ns */ - return trace_seq_printf(s, " %4lld: ", abs_ts); + trace_seq_printf(s, " %4lld: ", abs_ts); } + + return !trace_seq_has_overflowed(s); } int trace_print_context(struct trace_iterator *iter) @@ -520,34 +521,29 @@ int trace_print_context(struct trace_iterator *iter) unsigned long long t; unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; - int ret; trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", + trace_seq_printf(s, "%16s-%-5d [%03d] ", comm, entry->pid, iter->cpu); - if (!ret) - return 0; - if (trace_flags & TRACE_ITER_IRQ_INFO) { - ret = trace_print_lat_fmt(s, entry); - if (!ret) - return 0; - } + if (trace_flags & TRACE_ITER_IRQ_INFO) + trace_print_lat_fmt(s, entry); if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { t = ns2usecs(iter->ts); usec_rem = do_div(t, USEC_PER_SEC); secs = (unsigned long)t; - return trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); + trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); } else - return trace_seq_printf(s, " %12llu: ", iter->ts); + trace_seq_printf(s, " %12llu: ", iter->ts); + + return !trace_seq_has_overflowed(s); } int trace_print_lat_context(struct trace_iterator *iter) { u64 next_ts; - int ret; /* trace_find_next_entry will reset ent_size */ int ent_size = iter->ent_size; struct trace_seq *s = &iter->seq; @@ -567,18 +563,17 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf( - s, "%16s %5d %3d %d %08x %08lx ", - comm, entry->pid, iter->cpu, entry->flags, - entry->preempt_count, iter->idx); + trace_seq_printf( + s, "%16s %5d %3d %d %08x %08lx ", + comm, entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx); } else { - ret = lat_print_generic(s, entry, iter->cpu); + lat_print_generic(s, entry, iter->cpu); } - if (ret) - ret = lat_print_timestamp(iter, next_ts); + lat_print_timestamp(iter, next_ts); - return ret; + return !trace_seq_has_overflowed(s); } static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; @@ -692,7 +687,7 @@ int register_ftrace_event(struct trace_event *event) goto out; } else { - + event->type = next_event_type++; list = &ftrace_event_list; } @@ -764,10 +759,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, struct trace_event *event) { - if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type); - return TRACE_TYPE_HANDLED; + return trace_handle_return(&iter->seq); } /* TRACE_FN */ @@ -779,24 +773,16 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - if (!trace_seq_puts(s, " <-")) - goto partial; - if (!seq_print_ip_sym(s, - field->parent_ip, - flags)) - goto partial; + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, field->parent_ip, flags); } - if (!trace_seq_putc(s, '\n')) - goto partial; - return TRACE_TYPE_HANDLED; + trace_seq_putc(s, '\n'); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, @@ -806,12 +792,11 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(&iter->seq, "%lx %lx\n", - field->ip, - field->parent_ip)) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(&iter->seq, "%lx %lx\n", + field->ip, + field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(&iter->seq); } static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, @@ -822,10 +807,10 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - SEQ_PUT_HEX_FIELD_RET(s, field->ip); - SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); + SEQ_PUT_HEX_FIELD(s, field->ip); + SEQ_PUT_HEX_FIELD(s, field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, @@ -836,10 +821,10 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - SEQ_PUT_FIELD_RET(s, field->ip); - SEQ_PUT_FIELD_RET(s, field->parent_ip); + SEQ_PUT_FIELD(s, field->ip); + SEQ_PUT_FIELD(s, field->parent_ip); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static struct trace_event_functions trace_fn_funcs = { @@ -868,18 +853,17 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, T = task_state_char(field->next_state); S = task_state_char(field->prev_state); trace_find_cmdline(field->next_pid, comm); - if (!trace_seq_printf(&iter->seq, - " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", - field->prev_pid, - field->prev_prio, - S, delim, - field->next_cpu, - field->next_pid, - field->next_prio, - T, comm)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm); + + return trace_handle_return(&iter->seq); } static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, @@ -904,17 +888,16 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) if (!S) S = task_state_char(field->prev_state); T = task_state_char(field->next_state); - if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", - field->prev_pid, - field->prev_prio, - S, - field->next_cpu, - field->next_pid, - field->next_prio, - T)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T); + + return trace_handle_return(&iter->seq); } static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, @@ -942,15 +925,15 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) S = task_state_char(field->prev_state); T = task_state_char(field->next_state); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); - SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); - SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); - SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); - SEQ_PUT_HEX_FIELD_RET(s, T); + SEQ_PUT_HEX_FIELD(s, field->prev_pid); + SEQ_PUT_HEX_FIELD(s, field->prev_prio); + SEQ_PUT_HEX_FIELD(s, S); + SEQ_PUT_HEX_FIELD(s, field->next_cpu); + SEQ_PUT_HEX_FIELD(s, field->next_pid); + SEQ_PUT_HEX_FIELD(s, field->next_prio); + SEQ_PUT_HEX_FIELD(s, T); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, @@ -973,14 +956,15 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - SEQ_PUT_FIELD_RET(s, field->prev_pid); - SEQ_PUT_FIELD_RET(s, field->prev_prio); - SEQ_PUT_FIELD_RET(s, field->prev_state); - SEQ_PUT_FIELD_RET(s, field->next_pid); - SEQ_PUT_FIELD_RET(s, field->next_prio); - SEQ_PUT_FIELD_RET(s, field->next_state); + SEQ_PUT_FIELD(s, field->prev_pid); + SEQ_PUT_FIELD(s, field->prev_prio); + SEQ_PUT_FIELD(s, field->prev_state); + SEQ_PUT_FIELD(s, field->next_cpu); + SEQ_PUT_FIELD(s, field->next_pid); + SEQ_PUT_FIELD(s, field->next_prio); + SEQ_PUT_FIELD(s, field->next_state); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static struct trace_event_functions trace_ctx_funcs = { @@ -1020,23 +1004,19 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); end = (unsigned long *)((long)iter->ent + iter->ent_size); - if (!trace_seq_puts(s, "<stack trace>\n")) - goto partial; + trace_seq_puts(s, "<stack trace>\n"); for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { - if (!trace_seq_puts(s, " => ")) - goto partial; - if (!seq_print_ip_sym(s, *p, flags)) - goto partial; - if (!trace_seq_putc(s, '\n')) - goto partial; - } + if (trace_seq_has_overflowed(s)) + break; - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, " => "); + seq_print_ip_sym(s, *p, flags); + trace_seq_putc(s, '\n'); + } - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_stack_funcs = { @@ -1057,16 +1037,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_puts(s, "<user stack trace>\n")) - goto partial; - - if (!seq_print_userip_objs(field, s, flags)) - goto partial; - - return TRACE_TYPE_HANDLED; + trace_seq_puts(s, "<user stack trace>\n"); + seq_print_userip_objs(field, s, flags); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_user_stack_funcs = { @@ -1089,19 +1063,11 @@ trace_bputs_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_puts(s, field->str); - if (!trace_seq_puts(s, ": ")) - goto partial; - - if (!trace_seq_puts(s, field->str)) - goto partial; - - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } @@ -1114,16 +1080,10 @@ trace_bputs_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, ": %lx : ", field->ip)) - goto partial; - - if (!trace_seq_puts(s, field->str)) - goto partial; + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_puts(s, field->str); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_bputs_funcs = { @@ -1147,19 +1107,11 @@ trace_bprint_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_puts(s, ": ")) - goto partial; - - if (!trace_seq_bprintf(s, field->fmt, field->buf)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, ": "); + trace_seq_bprintf(s, field->fmt, field->buf); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } @@ -1172,16 +1124,10 @@ trace_bprint_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(s, ": %lx : ", field->ip)) - goto partial; - - if (!trace_seq_bprintf(s, field->fmt, field->buf)) - goto partial; + trace_seq_printf(s, ": %lx : ", field->ip); + trace_seq_bprintf(s, field->fmt, field->buf); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static struct trace_event_functions trace_bprint_funcs = { @@ -1203,16 +1149,10 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_ip_sym(s, field->ip, flags)) - goto partial; - - if (!trace_seq_printf(s, ": %s", field->buf)) - goto partial; + seq_print_ip_sym(s, field->ip, flags); + trace_seq_printf(s, ": %s", field->buf); - return TRACE_TYPE_HANDLED; - - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(s); } static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, @@ -1222,13 +1162,9 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) - goto partial; - - return TRACE_TYPE_HANDLED; + trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); - partial: - return TRACE_TYPE_PARTIAL_LINE; + return trace_handle_return(&iter->seq); } static struct trace_event_functions trace_print_funcs = { diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 80b25b585a7..8ef2c40efb3 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -35,17 +35,11 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); extern int __unregister_ftrace_event(struct trace_event *event); extern struct rw_semaphore trace_event_sem; -#define SEQ_PUT_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem(s, &(x), sizeof(x))) \ - return TRACE_TYPE_PARTIAL_LINE; \ -} while (0) - -#define SEQ_PUT_HEX_FIELD_RET(s, x) \ -do { \ - if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ - return TRACE_TYPE_PARTIAL_LINE; \ -} while (0) +#define SEQ_PUT_FIELD(s, x) \ + trace_seq_putmem(s, &(x), sizeof(x)) + +#define SEQ_PUT_HEX_FIELD(s, x) \ + trace_seq_putmem_hex(s, &(x), sizeof(x)) #endif diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2900817ba65..c4e70b6bd7f 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -305,7 +305,7 @@ static int t_show(struct seq_file *m, void *v) seq_puts(m, "\\t"); break; case '\\': - seq_puts(m, "\\"); + seq_putc(m, '\\'); break; case '"': seq_puts(m, "\\\""); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d4b9fc22cd2..b983b2fd2ca 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -40,7 +40,8 @@ const char *reserved_field_names[] = { int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \ void *data, void *ent) \ { \ - return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \ + return !trace_seq_has_overflowed(s); \ } \ const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \ NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type)); @@ -61,10 +62,11 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name, int len = *(u32 *)data >> 16; if (!len) - return trace_seq_printf(s, " %s=(fault)", name); + trace_seq_printf(s, " %s=(fault)", name); else - return trace_seq_printf(s, " %s=\"%s\"", name, - (const char *)get_loc_data(data, ent)); + trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); + return !trace_seq_has_overflowed(s); } NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string)); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3f34dc9b40f..2e293beb186 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -14,122 +14,26 @@ #include "trace.h" -static struct trace_array *ctx_trace; -static int __read_mostly tracer_enabled; static int sched_ref; static DEFINE_MUTEX(sched_register_mutex); -static int sched_stopped; - - -void -tracing_sched_switch_trace(struct trace_array *tr, - struct task_struct *prev, - struct task_struct *next, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->trace_buffer.buffer; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_CTX, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = prev->pid; - entry->prev_prio = prev->prio; - entry->prev_state = prev->state; - entry->next_pid = next->pid; - entry->next_prio = next->prio; - entry->next_state = next->state; - entry->next_cpu = task_cpu(next); - - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); -} static void probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) { - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); - - if (likely(!atomic_read(&data->disabled))) - tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); - - local_irq_restore(flags); -} - -void -tracing_sched_wakeup_trace(struct trace_array *tr, - struct task_struct *wakee, - struct task_struct *curr, - unsigned long flags, int pc) -{ - struct ftrace_event_call *call = &event_wakeup; - struct ring_buffer_event *event; - struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->trace_buffer.buffer; - - event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, - sizeof(*entry), flags, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->prev_pid = curr->pid; - entry->prev_prio = curr->prio; - entry->prev_state = curr->state; - entry->next_pid = wakee->pid; - entry->next_prio = wakee->prio; - entry->next_state = wakee->state; - entry->next_cpu = task_cpu(wakee); - - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, flags, pc); } static void probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) { - struct trace_array_cpu *data; - unsigned long flags; - int cpu, pc; - if (unlikely(!sched_ref)) return; tracing_record_cmdline(current); - - if (!tracer_enabled || sched_stopped) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); - - if (likely(!atomic_read(&data->disabled))) - tracing_sched_wakeup_trace(ctx_trace, wakee, current, - flags, pc); - - local_irq_restore(flags); } static int tracing_sched_register(void) @@ -197,51 +101,3 @@ void tracing_stop_cmdline_record(void) { tracing_stop_sched_switch(); } - -/** - * tracing_start_sched_switch_record - start tracing context switches - * - * Turns on context switch tracing for a tracer. - */ -void tracing_start_sched_switch_record(void) -{ - if (unlikely(!ctx_trace)) { - WARN_ON(1); - return; - } - - tracing_start_sched_switch(); - - mutex_lock(&sched_register_mutex); - tracer_enabled++; - mutex_unlock(&sched_register_mutex); -} - -/** - * tracing_stop_sched_switch_record - start tracing context switches - * - * Turns off context switch tracing for a tracer. - */ -void tracing_stop_sched_switch_record(void) -{ - mutex_lock(&sched_register_mutex); - tracer_enabled--; - WARN_ON(tracer_enabled < 0); - mutex_unlock(&sched_register_mutex); - - tracing_stop_sched_switch(); -} - -/** - * tracing_sched_switch_assign_trace - assign a trace array for ctx switch - * @tr: trace array pointer to assign - * - * Some tracers might want to record the context switches in their - * trace. This function lets those tracers assign the trace array - * to use. - */ -void tracing_sched_switch_assign_trace(struct trace_array *tr) -{ - ctx_trace = tr; -} - diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 19bd8928ce9..8fb84b36281 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -365,6 +365,62 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) wakeup_current_cpu = cpu; } +static void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + +static void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->trace_buffer.buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if (!call_filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + static void notrace probe_wakeup_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1f24ed99dca..f8b45d8792f 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -27,10 +27,19 @@ #include <linux/trace_seq.h> /* How much buffer is left on the trace_seq? */ -#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) +#define TRACE_SEQ_BUF_LEFT(s) seq_buf_buffer_left(&(s)->seq) /* How much buffer is written? */ -#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) +#define TRACE_SEQ_BUF_USED(s) seq_buf_used(&(s)->seq) + +/* + * trace_seq should work with being initialized with 0s. + */ +static inline void __trace_seq_init(struct trace_seq *s) +{ + if (unlikely(!s->seq.size)) + trace_seq_init(s); +} /** * trace_print_seq - move the contents of trace_seq into a seq_file @@ -43,10 +52,11 @@ */ int trace_print_seq(struct seq_file *m, struct trace_seq *s) { - unsigned int len = TRACE_SEQ_BUF_USED(s); int ret; - ret = seq_write(m, s->buffer, len); + __trace_seq_init(s); + + ret = seq_buf_print_seq(m, &s->seq); /* * Only reset this buffer if we successfully wrote to the @@ -69,34 +79,26 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) * trace_seq_printf() is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. - * - * Returns 1 if we successfully written all the contents to - * the buffer. - * Returns 0 if we the length to write is bigger than the - * reserved buffer space. In this case, nothing gets written. */ -int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); + unsigned int save_len = s->seq.len; va_list ap; - int ret; - if (s->full || !len) - return 0; + if (s->full) + return; + + __trace_seq_init(s); va_start(ap, fmt); - ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + seq_buf_vprintf(&s->seq, fmt, ap); va_end(ap); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; s->full = 1; - return 0; } - - s->len += ret; - - return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); @@ -107,25 +109,23 @@ EXPORT_SYMBOL_GPL(trace_seq_printf); * @nmaskbits: The number of bits that are valid in @maskp * * Writes a ASCII representation of a bitmask string into @s. - * - * Returns 1 if we successfully written all the contents to - * the buffer. - * Returns 0 if we the length to write is bigger than the - * reserved buffer space. In this case, nothing gets written. */ -int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, +void trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, int nmaskbits) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); - int ret; + unsigned int save_len = s->seq.len; - if (s->full || !len) - return 0; + if (s->full) + return; - ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); - s->len += ret; + __trace_seq_init(s); - return 1; + seq_buf_bitmask(&s->seq, maskp, nmaskbits); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + } } EXPORT_SYMBOL_GPL(trace_seq_bitmask); @@ -139,28 +139,23 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * trace_seq_printf is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); - int ret; + unsigned int save_len = s->seq.len; - if (s->full || !len) - return 0; + if (s->full) + return; - ret = vsnprintf(s->buffer + s->len, len, fmt, args); + __trace_seq_init(s); + + seq_buf_vprintf(&s->seq, fmt, args); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; s->full = 1; - return 0; } - - s->len += ret; - - return len; } EXPORT_SYMBOL_GPL(trace_seq_vprintf); @@ -178,28 +173,24 @@ EXPORT_SYMBOL_GPL(trace_seq_vprintf); * * This function will take the format and the binary array and finish * the conversion into the ASCII string within the buffer. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { - unsigned int len = TRACE_SEQ_BUF_LEFT(s); - int ret; + unsigned int save_len = s->seq.len; - if (s->full || !len) - return 0; + if (s->full) + return; + + __trace_seq_init(s); - ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + seq_buf_bprintf(&s->seq, fmt, binary); /* If we can't write it all, don't bother writing anything */ - if (ret >= len) { + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; s->full = 1; - return 0; + return; } - - s->len += ret; - - return len; } EXPORT_SYMBOL_GPL(trace_seq_bprintf); @@ -212,25 +203,22 @@ EXPORT_SYMBOL_GPL(trace_seq_bprintf); * copy to user routines. This function records a simple string * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_puts(struct trace_seq *s, const char *str) +void trace_seq_puts(struct trace_seq *s, const char *str) { unsigned int len = strlen(str); if (s->full) - return 0; + return; + + __trace_seq_init(s); if (len > TRACE_SEQ_BUF_LEFT(s)) { s->full = 1; - return 0; + return; } - memcpy(s->buffer + s->len, str, len); - s->len += len; - - return len; + seq_buf_putmem(&s->seq, str, len); } EXPORT_SYMBOL_GPL(trace_seq_puts); @@ -243,22 +231,20 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); * copy to user routines. This function records a simple charater * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_putc(struct trace_seq *s, unsigned char c) +void trace_seq_putc(struct trace_seq *s, unsigned char c) { if (s->full) - return 0; + return; + + __trace_seq_init(s); if (TRACE_SEQ_BUF_LEFT(s) < 1) { s->full = 1; - return 0; + return; } - s->buffer[s->len++] = c; - - return 1; + seq_buf_putc(&s->seq, c); } EXPORT_SYMBOL_GPL(trace_seq_putc); @@ -271,29 +257,23 @@ EXPORT_SYMBOL_GPL(trace_seq_putc); * There may be cases where raw memory needs to be written into the * buffer and a strcpy() would not work. Using this function allows * for such cases. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +void trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) { if (s->full) - return 0; + return; + + __trace_seq_init(s); if (len > TRACE_SEQ_BUF_LEFT(s)) { s->full = 1; - return 0; + return; } - memcpy(s->buffer + s->len, mem, len); - s->len += len; - - return len; + seq_buf_putmem(&s->seq, mem, len); } EXPORT_SYMBOL_GPL(trace_seq_putmem); -#define MAX_MEMHEX_BYTES 8U -#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) - /** * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex * @s: trace sequence descriptor @@ -303,41 +283,31 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem); * This is similar to trace_seq_putmem() except instead of just copying the * raw memory into the buffer it writes its ASCII representation of it * in hex characters. - * - * Returns how much it wrote to the buffer. */ -int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, +void trace_seq_putmem_hex(struct trace_seq *s, const void *mem, unsigned int len) { - unsigned char hex[HEX_CHARS]; - const unsigned char *data = mem; - unsigned int start_len; - int i, j; - int cnt = 0; + unsigned int save_len = s->seq.len; if (s->full) - return 0; + return; - while (len) { - start_len = min(len, HEX_CHARS - 1); -#ifdef __BIG_ENDIAN - for (i = 0, j = 0; i < start_len; i++) { -#else - for (i = start_len-1, j = 0; i >= 0; i--) { -#endif - hex[j++] = hex_asc_hi(data[i]); - hex[j++] = hex_asc_lo(data[i]); - } - if (WARN_ON_ONCE(j == 0 || j/2 > len)) - break; - - /* j increments twice per loop */ - len -= j / 2; - hex[j++] = ' '; - - cnt += trace_seq_putmem(s, hex, j); + __trace_seq_init(s); + + /* Each byte is represented by two chars */ + if (len * 2 > TRACE_SEQ_BUF_LEFT(s)) { + s->full = 1; + return; + } + + /* The added spaces can still cause an overflow */ + seq_buf_putmem_hex(&s->seq, mem, len); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return; } - return cnt; } EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); @@ -355,30 +325,27 @@ EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); */ int trace_seq_path(struct trace_seq *s, const struct path *path) { - unsigned char *p; + unsigned int save_len = s->seq.len; if (s->full) return 0; + __trace_seq_init(s); + if (TRACE_SEQ_BUF_LEFT(s) < 1) { s->full = 1; return 0; } - p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); - if (!IS_ERR(p)) { - p = mangle_path(s->buffer + s->len, p, "\n"); - if (p) { - s->len = p - s->buffer; - return 1; - } - } else { - s->buffer[s->len++] = '?'; - return 1; + seq_buf_path(&s->seq, path, "\n"); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; } - s->full = 1; - return 0; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_path); @@ -404,25 +371,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); */ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) { - int len; - int ret; - - if (!cnt) - return 0; - - if (s->len <= s->readpos) - return -EBUSY; - - len = s->len - s->readpos; - if (cnt > len) - cnt = len; - ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); - if (ret == cnt) - return -EFAULT; - - cnt -= ret; - - s->readpos += cnt; - return cnt; + __trace_seq_init(s); + return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 29228c4d569..c6ee36fcbf9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -114,7 +114,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_entry *ent = iter->ent; struct syscall_trace_enter *trace; struct syscall_metadata *entry; - int i, ret, syscall; + int i, syscall; trace = (typeof(trace))ent; syscall = trace->nr; @@ -128,35 +128,28 @@ print_syscall_enter(struct trace_iterator *iter, int flags, goto end; } - ret = trace_seq_printf(s, "%s(", entry->name); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s(", entry->name); for (i = 0; i < entry->nb_args; i++) { + + if (trace_seq_has_overflowed(s)) + goto end; + /* parameter types */ - if (trace_flags & TRACE_ITER_VERBOSE) { - ret = trace_seq_printf(s, "%s ", entry->types[i]); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - } + if (trace_flags & TRACE_ITER_VERBOSE) + trace_seq_printf(s, "%s ", entry->types[i]); + /* parameter values */ - ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], - trace->args[i], - i == entry->nb_args - 1 ? "" : ", "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_printf(s, "%s: %lx%s", entry->args[i], + trace->args[i], + i == entry->nb_args - 1 ? "" : ", "); } - ret = trace_seq_putc(s, ')'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - + trace_seq_putc(s, ')'); end: - ret = trace_seq_putc(s, '\n'); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; + return trace_handle_return(s); } static enum print_line_t @@ -168,7 +161,6 @@ print_syscall_exit(struct trace_iterator *iter, int flags, struct syscall_trace_exit *trace; int syscall; struct syscall_metadata *entry; - int ret; trace = (typeof(trace))ent; syscall = trace->nr; @@ -176,7 +168,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, if (!entry) { trace_seq_putc(s, '\n'); - return TRACE_TYPE_HANDLED; + goto out; } if (entry->exit_event->event.type != ent->type) { @@ -184,12 +176,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, return TRACE_TYPE_UNHANDLED; } - ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, + trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, trace->ret); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + out: + return trace_handle_return(s); } extern char *__bad_type_size(void); @@ -523,7 +514,7 @@ unsigned long __init __weak arch_syscall_addr(int nr) return (unsigned long)sys_call_table[nr]; } -static int __init init_ftrace_syscalls(void) +void __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; unsigned long addr; @@ -533,7 +524,7 @@ static int __init init_ftrace_syscalls(void) GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); - return -ENOMEM; + return; } for (i = 0; i < NR_syscalls; i++) { @@ -545,10 +536,7 @@ static int __init init_ftrace_syscalls(void) meta->syscall_nr = i; syscalls_metadata[i] = meta; } - - return 0; } -early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 33ff6a24b80..8520acc34b1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -552,8 +552,7 @@ error: return ret; fail_address_parse: - if (inode) - iput(inode); + iput(inode); pr_info("Failed to parse address or file.\n"); @@ -606,7 +605,7 @@ static int probes_seq_show(struct seq_file *m, void *v) for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -852,16 +851,14 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e tu = container_of(event, struct trace_uprobe, tp.call.event); if (is_ret_probe(tu)) { - if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", - ftrace_event_name(&tu->tp.call), - entry->vaddr[1], entry->vaddr[0])) - goto partial; + trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[1], entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, true); } else { - if (!trace_seq_printf(s, "%s: (0x%lx)", - ftrace_event_name(&tu->tp.call), - entry->vaddr[0])) - goto partial; + trace_seq_printf(s, "%s: (0x%lx)", + ftrace_event_name(&tu->tp.call), + entry->vaddr[0]); data = DATAOF_TRACE_ENTRY(entry, false); } @@ -869,14 +866,13 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e struct probe_arg *parg = &tu->tp.args[i]; if (!parg->type->print(s, parg->name, data + parg->offset, entry)) - goto partial; + goto out; } - if (trace_seq_puts(s, "\n")) - return TRACE_TYPE_HANDLED; + trace_seq_putc(s, '\n'); -partial: - return TRACE_TYPE_PARTIAL_LINE; + out: + return trace_handle_return(s); } typedef bool (*filter_func_t)(struct uprobe_consumer *self, diff --git a/kernel/uid16.c b/kernel/uid16.c index 602e5bbbcef..d58cc4d8f0d 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!ns_capable(current_user_ns(), CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/user.c b/kernel/user.c index 4efa39350e4..b069ccbfb0b 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -50,7 +50,11 @@ struct user_namespace init_user_ns = { .count = ATOMIC_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .proc_inum = PROC_USER_INIT_INO, + .ns.inum = PROC_USER_INIT_INO, +#ifdef CONFIG_USER_NS + .ns.ops = &userns_operations, +#endif + .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_PERSISTENT_KEYRINGS .persistent_keyring_register_sem = __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index aa312b0dc3e..4109f832068 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -24,6 +24,7 @@ #include <linux/fs_struct.h> static struct kmem_cache *user_ns_cachep __read_mostly; +static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, @@ -86,11 +87,12 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; - ret = proc_alloc_inum(&ns->proc_inum); + ret = ns_alloc_inum(&ns->ns); if (ret) { kmem_cache_free(user_ns_cachep, ns); return ret; } + ns->ns.ops = &userns_operations; atomic_set(&ns->count, 1); /* Leave the new->user_ns reference with the new user namespace. */ @@ -99,6 +101,11 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; + /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ + mutex_lock(&userns_state_mutex); + ns->flags = parent_ns->flags; + mutex_unlock(&userns_state_mutex); + set_cred_user_ns(new, ns); #ifdef CONFIG_PERSISTENT_KEYRINGS @@ -136,7 +143,7 @@ void free_user_ns(struct user_namespace *ns) #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); ns = parent; } while (atomic_dec_and_test(&parent->count)); @@ -583,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, return false; } - -static DEFINE_MUTEX(id_map_mutex); - static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -602,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; /* - * The id_map_mutex serializes all writes to any given map. + * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * @@ -620,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ - mutex_lock(&id_map_mutex); + mutex_lock(&userns_state_mutex); ret = -EPERM; /* Only allow one successful write to the map */ @@ -640,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (!page) goto out; - /* Only allow <= page size writes at the beginning of the file */ + /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; @@ -750,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, *ppos = count; ret = count; out: - mutex_unlock(&id_map_mutex); + mutex_unlock(&userns_state_mutex); if (page) free_page(page); return ret; @@ -812,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { - /* Allow mapping to your own filesystem ids */ - if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + const struct cred *cred = file->f_cred; + /* Don't allow mappings that would allow anything that wouldn't + * be allowed without the establishment of unprivileged mappings. + */ + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && + uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, file->f_cred->fsuid)) + if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, file->f_cred->fsgid)) + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && + gid_eq(gid, cred->egid)) return true; } } @@ -841,7 +850,106 @@ static bool new_idmap_permitted(const struct file *file, return false; } -static void *userns_get(struct task_struct *task) +int proc_setgroups_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + unsigned long userns_flags = ACCESS_ONCE(ns->flags); + + seq_printf(seq, "%s\n", + (userns_flags & USERNS_SETGROUPS_ALLOWED) ? + "allow" : "deny"); + return 0; +} + +ssize_t proc_setgroups_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + char kbuf[8], *pos; + bool setgroups_allowed; + ssize_t ret; + + /* Only allow a very narrow range of strings to be written */ + ret = -EINVAL; + if ((*ppos != 0) || (count >= sizeof(kbuf))) + goto out; + + /* What was written? */ + ret = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + kbuf[count] = '\0'; + pos = kbuf; + + /* What is being requested? */ + ret = -EINVAL; + if (strncmp(pos, "allow", 5) == 0) { + pos += 5; + setgroups_allowed = true; + } + else if (strncmp(pos, "deny", 4) == 0) { + pos += 4; + setgroups_allowed = false; + } + else + goto out; + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') + goto out; + + ret = -EPERM; + mutex_lock(&userns_state_mutex); + if (setgroups_allowed) { + /* Enabling setgroups after setgroups has been disabled + * is not allowed. + */ + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) + goto out_unlock; + } else { + /* Permanently disabling setgroups after setgroups has + * been enabled by writing the gid_map is not allowed. + */ + if (ns->gid_map.nr_extents != 0) + goto out_unlock; + ns->flags &= ~USERNS_SETGROUPS_ALLOWED; + } + mutex_unlock(&userns_state_mutex); + + /* Report a successful write */ + *ppos = count; + ret = count; +out: + return ret; +out_unlock: + mutex_unlock(&userns_state_mutex); + goto out; +} + +bool userns_may_setgroups(const struct user_namespace *ns) +{ + bool allowed; + + mutex_lock(&userns_state_mutex); + /* It is not safe to use setgroups until a gid mapping in + * the user namespace has been established. + */ + allowed = ns->gid_map.nr_extents != 0; + /* Is setgroups allowed? */ + allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); + mutex_unlock(&userns_state_mutex); + + return allowed; +} + +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + +static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; @@ -849,17 +957,17 @@ static void *userns_get(struct task_struct *task) user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); - return user_ns; + return user_ns ? &user_ns->ns : NULL; } -static void userns_put(void *ns) +static void userns_put(struct ns_common *ns) { - put_user_ns(ns); + put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, void *ns) +static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) { - struct user_namespace *user_ns = ns; + struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering @@ -888,19 +996,12 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) return commit_creds(cred); } -static unsigned int userns_inum(void *ns) -{ - struct user_namespace *user_ns = ns; - return user_ns->proc_inum; -} - const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, - .inum = userns_inum, }; static __init int user_namespaces_init(void) diff --git a/kernel/utsname.c b/kernel/utsname.c index 883aaaa7de8..831ea710823 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -42,12 +42,14 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) return ERR_PTR(-ENOMEM); - err = proc_alloc_inum(&ns->proc_inum); + err = ns_alloc_inum(&ns->ns); if (err) { kfree(ns); return ERR_PTR(err); } + ns->ns.ops = &utsns_operations; + down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); @@ -84,11 +86,16 @@ void free_uts_ns(struct kref *kref) ns = container_of(kref, struct uts_namespace, kref); put_user_ns(ns->user_ns); - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); kfree(ns); } -static void *utsns_get(struct task_struct *task) +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; @@ -101,17 +108,17 @@ static void *utsns_get(struct task_struct *task) } task_unlock(task); - return ns; + return ns ? &ns->ns : NULL; } -static void utsns_put(void *ns) +static void utsns_put(struct ns_common *ns) { - put_uts_ns(ns); + put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, void *new) +static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) { - struct uts_namespace *ns = new; + struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) @@ -123,18 +130,10 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) return 0; } -static unsigned int utsns_inum(void *vp) -{ - struct uts_namespace *ns = vp; - - return ns->proc_inum; -} - const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, - .inum = utsns_inum, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 09b685daee3..6202b08f193 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1804,8 +1804,8 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); + spin_lock(&wq_mayday_lock); /* for wq->maydays */ if (need_to_create_worker(pool)) { /* @@ -1818,8 +1818,8 @@ static void pool_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock(&pool->lock); - spin_unlock_irq(&wq_mayday_lock); + spin_unlock(&wq_mayday_lock); + spin_unlock_irq(&pool->lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2248,12 +2248,30 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); + WARN_ON_ONCE(!list_empty(scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); - process_scheduled_works(rescuer); + if (!list_empty(scheduled)) { + process_scheduled_works(rescuer); + + /* + * The above execution of rescued work items could + * have created more to rescue through + * pwq_activate_first_delayed() or chained + * queueing. Let's put @pwq back on mayday list so + * that such back-to-back work items, which may be + * being used to relieve memory pressure, don't + * incur MAYDAY_INTERVAL delay inbetween. + */ + if (need_to_create_worker(pool)) { + spin_lock(&wq_mayday_lock); + get_pwq(pwq); + list_move_tail(&pwq->mayday_node, &wq->maydays); + spin_unlock(&wq_mayday_lock); + } + } /* * Put the reference grabbed by send_mayday(). @pool won't |