Diffstat (limited to 'kernel')
65 files changed, 1516 insertions, 953 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index d96045789b5..77770a034d5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) struct task_struct *tsk; int err; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - err = -ESRCH; - if (!tsk) - goto out; - err = 0; - - spin_lock_irq(&tsk->sighand->siglock); - if (!tsk->signal->audit_tty) - err = -EPERM; - spin_unlock_irq(&tsk->sighand->siglock); - if (err) - goto out; - - tty_audit_push_task(tsk, loginuid, sessionid); -out: - read_unlock(&tasklist_lock); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + err = tty_audit_push_task(tsk, loginuid, sessionid); + put_task_struct(tsk); return err; } @@ -506,7 +499,7 @@ int audit_send_list(void *_dest) } struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, - int multi, void *payload, int size) + int multi, const void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; @@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the pid. * No failure notifications. */ -void audit_send_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size) +static void audit_send_reply(int pid, int seq, int type, int done, int multi, + const void *payload, int size) { struct sk_buff *skb; struct task_struct *tsk; @@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_TTY_GET: { struct audit_tty_status s; struct task_struct *tsk; + unsigned long flags; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { s.enabled = tsk->signal->audit_tty != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0, - &s, sizeof(s)); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); + + if (!err) + audit_send_reply(NETLINK_CB(skb).pid, seq, + AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status *s; struct task_struct *tsk; + unsigned long flags; if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) return -EINVAL; s = data; if (s->enabled != 0 && s->enabled != 1) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk) - err = -ESRCH; - else { - spin_lock_irq(&tsk->sighand->siglock); + if (tsk && lock_task_sighand(tsk, &flags)) { tsk->signal->audit_tty = s->enabled != 0; - spin_unlock_irq(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); + } else + err = -ESRCH; + rcu_read_unlock(); break; } default: diff --git a/kernel/audit.h b/kernel/audit.h index f7206db4e13..91e7071c4d2 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path, int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, - void *payload, int size); -extern void audit_send_reply(int pid, int seq, int type, - int done, int multi, - void *payload, int size); + const void *payload, int size); extern void audit_panic(const char *message); struct audit_netlink_list { diff --git a/kernel/audit_tree.c 
b/kernel/audit_tree.c index 7f18d3a4527..37b2bea170c 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -223,7 +223,7 @@ static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); struct fsnotify_mark *entry = &chunk->mark; - struct audit_chunk *new; + struct audit_chunk *new = NULL; struct audit_tree *owner; int size = chunk->count - 1; int i, j; @@ -232,9 +232,14 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); + if (size) + new = alloc_chunk(size); + spin_lock(&entry->lock); if (chunk->dead || !entry->i.inode) { spin_unlock(&entry->lock); + if (new) + free_chunk(new); goto out; } @@ -255,9 +260,9 @@ static void untag_chunk(struct node *p) goto out; } - new = alloc_chunk(size); if (!new) goto Fallback; + fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f0c9b2e7542..d2e3c786646 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -60,7 +60,7 @@ struct audit_parent { }; /* fsnotify handle. */ -struct fsnotify_group *audit_watch_group; +static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ @@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch) } } -void audit_remove_watch(struct audit_watch *watch) +static void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); audit_put_parent(watch->parent); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index eb7675499fb..add2819af71 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, case AUDIT_LOGINUID: result = audit_comparator(cb->loginuid, f->op, f->val); break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) + result = security_audit_rule_match(cb->sid, + f->type, + f->op, + f->lsm_rule, + NULL); + break; } if (!result) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1b31c130d03..f49a0318c2e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -241,6 +241,10 @@ struct audit_context { pid_t pid; struct audit_cap_data cap; } capset; + struct { + int fd; + int flags; + } mmap; }; int fds[2]; @@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); break; } + case AUDIT_MMAP: { + audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, + context->mmap.flags); + break; } } audit_log_end(ab); } @@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid, context->type = AUDIT_CAPSET; } +void __audit_mmap_fd(int fd, int flags) +{ + struct audit_context *context = current->audit_context; + context->mmap.fd = fd; + context->mmap.flags = flags; + context->type = AUDIT_MMAP; +} + /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b69b8d0313..66a416b42c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -243,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } +static int clone_children(const struct cgroup *cgrp) +{ + return test_bit(CGRP_CLONE_CHILDREN, 
&cgrp->flags); +} + /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -777,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { + inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); @@ -1039,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (clone_children(&root->top_cgroup)) + seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); @@ -1049,6 +1057,7 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + bool clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1065,7 +1074,8 @@ struct cgroup_sb_opts { */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { - char *token, *o = data ?: "all"; + char *token, *o = data; + bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; int i; bool module_pin_failed = false; @@ -1081,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) while ((token = strsep(&o, ",")) != NULL) { if (!*token) return -EINVAL; - if (!strcmp(token, "all")) { - /* Add all non-disabled subsystems */ - opts->subsys_bits = 0; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss == NULL) - continue; - if (!ss->disabled) - opts->subsys_bits |= 1ul << i; - } - } else if (!strcmp(token, "none")) { + if (!strcmp(token, "none")) { /* Explicitly have no subsystems */ opts->none = true; - } else if (!strcmp(token, "noprefix")) { + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); - } else if (!strncmp(token, "release_agent=", 14)) { + continue; + } + if (!strcmp(token, "clone_children")) { + opts->clone_children = true; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; @@ -1104,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - } else if (!strncmp(token, "name=", 5)) { + continue; + } + if (!strncmp(token, "name=", 5)) { const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1126,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) GFP_KERNEL); if (!opts->name) return -ENOMEM; - } else { - struct cgroup_subsys *ss; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - ss = subsys[i]; - if (ss == NULL) - continue; - if (!strcmp(token, ss->name)) { - if (!ss->disabled) - set_bit(i, &opts->subsys_bits); - break; - } - } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + + continue; + } + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (strcmp(token, ss->name)) + continue; + if (ss->disabled) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if (all_ss) + return -EINVAL; + 
set_bit(i, &opts->subsys_bits); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise 'all, 'none' and a subsystem name options were not + * specified, let's default to 'all' + */ + if (all_ss || (!all_ss && !one_ss && !opts->none)) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (ss->disabled) + continue; + set_bit(i, &opts->subsys_bits); } } @@ -1354,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); + if (opts->clone_children) + set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -1417,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb) return 0; } -static int cgroup_get_sb(struct file_system_type *fs_type, +static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) + void *data) { struct cgroup_sb_opts opts; struct cgroupfs_root *root; @@ -1553,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_parsed_module_refcounts(opts.subsys_bits); } - simple_set_mnt(mnt, sb); kfree(opts.release_agent); kfree(opts.name); - return 0; + return dget(sb->s_root); drop_new_super: deactivate_locked_super(sb); @@ -1565,7 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, out_err: kfree(opts.release_agent); kfree(opts.name); - return ret; + return ERR_PTR(ret); } static void cgroup_kill_sb(struct super_block *sb) { @@ -1615,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) { static struct file_system_type cgroup_fs_type = { .name = "cgroup", - .get_sb = cgroup_get_sb, + .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, }; @@ -1879,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (strlen(buffer) >= PATH_MAX) + return -EINVAL; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; strcpy(cgrp->root->release_agent_path, buffer); @@ -3172,6 +3216,23 @@ fail: return ret; } +static u64 cgroup_clone_children_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return clone_children(cgrp); +} + +static int cgroup_clone_children_write(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + if (val) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + else + clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + return 0; +} + /* * for the common functions, 'private' gives the type of file */ @@ -3202,6 +3263,11 @@ static struct cftype files[] = { .write_string = cgroup_write_event_control, .mode = S_IWUGO, }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, }; static struct cftype cft_release_agent = { @@ -3331,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + if (clone_children(parent)) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); @@ -3345,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_destroy; } /* At error, ->destroy() callback has to free assigned ID. 
*/ + if (clone_children(parent) && ss->post_clone) + ss->post_clone(ss, cgrp); } cgroup_lock_hierarchy(root); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ce71ed53e88..e7bebb7c6c3 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } -int cgroup_freezing_or_frozen(struct task_struct *task) +static inline int __cgroup_freezing_or_frozen(struct task_struct *task) { - struct freezer *freezer; - enum freezer_state state; + enum freezer_state state = task_freezer(task)->state; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); +} +int cgroup_freezing_or_frozen(struct task_struct *task) +{ + int result; task_lock(task); - freezer = task_freezer(task); - if (!freezer->css.cgroup->parent) - state = CGROUP_THAWED; /* root cgroup can't be frozen */ - else - state = freezer->state; + result = __cgroup_freezing_or_frozen(task); task_unlock(task); - - return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); + return result; } /* @@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss, kfree(cgroup_freezer(cgroup)); } -/* Task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss, /* * Anything frozen can't move or be moved to/from. - * - * Since orig_freezer->state == FROZEN means that @task has been - * frozen, so it's sufficient to check the latter condition. */ - if (is_task_frozen_enough(task)) + freezer = cgroup_freezer(new_cgroup); + if (freezer->state != CGROUP_THAWED) return -EBUSY; - freezer = cgroup_freezer(new_cgroup); - if (freezer->state == CGROUP_FROZEN) + rcu_read_lock(); + if (__cgroup_freezing_or_frozen(task)) { + rcu_read_unlock(); return -EBUSY; + } + rcu_read_unlock(); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (is_task_frozen_enough(c)) { + if (__cgroup_freezing_or_frozen(c)) { rcu_read_unlock(); return -EBUSY; } @@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) /* * caller must hold freezer->lock */ -static void update_freezer_state(struct cgroup *cgroup, +static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; unsigned int nfrozen = 0, ntotal = 0; + enum freezer_state old_state = freezer->state; cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (is_task_frozen_enough(task)) + if (frozen(task)) nfrozen++; } - /* - * Transition to FROZEN when no new tasks can be added ensures - * that we never exist in the FROZEN state while there are unfrozen - * tasks. 
- */ - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - else if (nfrozen > 0) - freezer->state = CGROUP_FREEZING; - else - freezer->state = CGROUP_THAWED; + if (old_state == CGROUP_THAWED) { + BUG_ON(nfrozen > 0); + } else if (old_state == CGROUP_FREEZING) { + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + } else { /* old_state == CGROUP_FROZEN */ + BUG_ON(nfrozen != ntotal); + } + cgroup_iter_end(cgroup, &it); } @@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, if (state == CGROUP_FREEZING) { /* We change from FREEZING to FROZEN lazily if the cgroup was * only partially frozen when we exitted write. */ - update_freezer_state(cgroup, freezer); + update_if_frozen(cgroup, freezer); state = freezer->state; } spin_unlock_irq(&freezer->lock); @@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) while ((task = cgroup_iter_next(cgroup, &it))) { if (!freeze_task(task, true)) continue; - if (is_task_frozen_enough(task)) + if (frozen(task)) continue; if (!freezing(task) && !freezer_should_skip(task)) num_cant_freeze_now++; @@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); - update_freezer_state(cgroup, freezer); + update_if_frozen(cgroup, freezer); if (goal_state == freezer->state) goto out; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 51b143e2a07..4349935c2ad 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock); * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static int cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data, struct vfsmount *mnt) +static struct dentry *cpuset_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, void *data) { struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - int ret = -ENODEV; + struct dentry *ret = ERR_PTR(-ENODEV); if (cgroup_fs) { char mountopts[] = "cpuset,noprefix," "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->get_sb(cgroup_fs, flags, - unused_dev_name, mountopts, mnt); + ret = cgroup_fs->mount(cgroup_fs, flags, + unused_dev_name, mountopts); put_filesystem(cgroup_fs); } return ret; @@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type, static struct file_system_type cpuset_fs_type = { .name = "cpuset", - .get_sb = cpuset_get_sb, + .mount = cpuset_mount, }; /* diff --git a/kernel/cred.c b/kernel/cred.c index 9a3e22641fe..6a1aa004e37 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_guard_mutex + * - The caller must hold ->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_guard_mutex); - if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index fec596da9bd..cefd4a11f6d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -209,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs) return 0; } -/** - * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. - * @regs: Current &struct pt_regs. 
- * - * This function will be called if the particular architecture must - * disable hardware debugging while it is processing gdb packets or - * handling exception. - */ -void __weak kgdb_disable_hw_debug(struct pt_regs *regs) -{ -} - /* * Some architectures need cache flushes when we set/clear a * breakpoint: @@ -484,7 +472,9 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, atomic_inc(&masters_in_kgdb); else atomic_inc(&slaves_in_kgdb); - kgdb_disable_hw_debug(ks->linux_regs); + + if (arch_kgdb_ops.disable_hw_break) + arch_kgdb_ops.disable_hw_break(regs); acquirelock: /* diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index d7bda21a106..a6e72976682 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -82,7 +82,7 @@ static kdbtab_t kdb_base_commands[50]; #define for_each_kdbcmd(cmd, num) \ for ((cmd) = kdb_base_commands, (num) = 0; \ num < kdb_max_commands; \ - num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++) + num++, num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++) typedef struct _kdbmsg { int km_diag; /* kdb diagnostic */ @@ -646,7 +646,7 @@ static int kdb_defcmd2(const char *cmdstr, const char *argv0) } if (!s->usable) return KDB_NOTIMP; - s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); + s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); if (!s->command) { kdb_printf("Could not allocate new kdb_defcmd table for %s\n", cmdstr); @@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, /* special case below */ } else { kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", - kdb_current, kdb_current->pid); + kdb_current, kdb_current ? kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); #endif @@ -2361,7 +2361,7 @@ static int kdb_pid(int argc, const char **argv) */ static int kdb_ll(int argc, const char **argv) { - int diag; + int diag = 0; unsigned long addr; long offset = 0; unsigned long va; @@ -2400,20 +2400,21 @@ static int kdb_ll(int argc, const char **argv) char buf[80]; if (KDB_FLAG(CMD_INTERRUPT)) - return 0; + goto out; sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); diag = kdb_parse(buf); if (diag) - return diag; + goto out; addr = va + linkoffset; if (kdb_getword(&va, addr, sizeof(va))) - return 0; + goto out; } - kfree(command); - return 0; +out: + kfree(command); + return diag; } static int kdb_kgdb(int argc, const char **argv) @@ -2603,20 +2604,17 @@ static int kdb_summary(int argc, const char **argv) */ static int kdb_per_cpu(int argc, const char **argv) { - char buf[256], fmtstr[64]; - kdb_symtab_t symtab; - cpumask_t suppress = CPU_MASK_NONE; - int cpu, diag; - unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL; + char fmtstr[64]; + int cpu, diag, nextarg = 1; + unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL; if (argc < 1 || argc > 3) return KDB_ARGCOUNT; - snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]); - if (!kdbgetsymval(buf, &symtab)) { - kdb_printf("%s is not a per_cpu variable\n", argv[1]); - return KDB_BADADDR; - } + diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); + if (diag) + return diag; + if (argc >= 2) { diag = kdbgetularg(argv[2], &bytesperword); if (diag) @@ -2649,46 +2647,25 @@ static int kdb_per_cpu(int argc, const char **argv) #define KDB_PCU(cpu) 0 #endif #endif - for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + if (whichcpu != ~0UL && whichcpu != cpu) 
continue; - addr = symtab.sym_start + KDB_PCU(cpu); + addr = symaddr + KDB_PCU(cpu); diag = kdb_getword(&val, addr, bytesperword); if (diag) { kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " "read, diag=%d\n", cpu, addr, diag); continue; } -#ifdef CONFIG_SMP - if (!val) { - cpu_set(cpu, suppress); - continue; - } -#endif /* CONFIG_SMP */ kdb_printf("%5d ", cpu); kdb_md_line(fmtstr, addr, bytesperword == KDB_WORD_SIZE, 1, bytesperword, 1, 1, 0); } - if (cpus_weight(suppress) == 0) - return 0; - kdb_printf("Zero suppressed cpu(s):"); - for (cpu = first_cpu(suppress); cpu < num_possible_cpus(); - cpu = next_cpu(cpu, suppress)) { - kdb_printf(" %d", cpu); - if (cpu == num_possible_cpus() - 1 || - next_cpu(cpu, suppress) != cpu + 1) - continue; - while (cpu < num_possible_cpus() && - next_cpu(cpu, suppress) == cpu + 1) - ++cpu; - kdb_printf("-%d", cpu); - } - kdb_printf("\n"); - #undef KDB_PCU - return 0; } @@ -2763,13 +2740,13 @@ int kdb_register_repeat(char *cmd, } if (kdb_commands) { memcpy(new, kdb_commands, - kdb_max_commands * sizeof(*new)); + (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); kfree(kdb_commands); } memset(new + kdb_max_commands, 0, kdb_command_extend * sizeof(*new)); kdb_commands = new; - kp = kdb_commands + kdb_max_commands; + kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; kdb_max_commands += kdb_command_extend; } diff --git a/kernel/exit.c b/kernel/exit.c index e2bdf37f9fd..676149a4ac5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,7 @@ #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> +#include <linux/oom.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -95,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk) sig->tty = NULL; } else { /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + + /* * If there is any task waiting for the group exit * then notify it: */ @@ -687,6 +696,8 @@ static void exit_mm(struct task_struct * tsk) enter_lazy_tlb(mm, current); /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); @@ -700,6 +711,8 @@ static void exit_mm(struct task_struct * tsk) * space. */ static struct task_struct *find_new_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *thread; @@ -901,6 +914,15 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); + /* + * If do_exit is called because this processes oopsed, it's possible + * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before + * continuing. Amongst other possible reasons, this is to prevent + * mm_release()->clear_child_tid() from writing to a user-controlled + * kernel address. 
+ */ + set_fs(USER_DS); + tracehook_report_exit(&code); validate_creds_for_do_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408..5447dc7defa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -65,6 +65,7 @@ #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> +#include <linux/oom.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -272,6 +273,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); + clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ @@ -488,6 +490,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->cached_hole_size = ~0UL; mm_init_aio(mm); mm_init_owner(mm, p); + atomic_set(&mm->oom_disable_count, 0); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -741,6 +744,8 @@ good_mm: /* Initializing for Swap token stuff */ mm->token_priority = 0; mm->last_interval = 0; + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_inc(&mm->oom_disable_count); tsk->mm = mm; tsk->active_mm = mm; @@ -904,6 +909,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + mutex_init(&sig->cred_guard_mutex); + return 0; } @@ -1299,8 +1306,13 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -1693,6 +1705,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) active_mm = current->active_mm; current->mm = new_mm; current->active_mm = new_mm; + if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + atomic_dec(&mm->oom_disable_count); + atomic_inc(&new_mm->oom_disable_count); + } activate_mm(active_mm, new_mm); new_mm = mm; } diff --git a/kernel/futex.c b/kernel/futex.c index a118bf160e0..40a8777a27d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); + ihold(key->shared.inode); break; case FUT_OFF_MMSHARED: atomic_inc(&key->private.mm->mm_count); @@ -2489,7 +2489,8 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int uninitialized_var(next_pi); unsigned long futex_offset; int rc; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 06da4dfc339..a7934ac75e5 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -49,7 +49,8 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *next_entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int uninitialized_var(next_pi); compat_uptr_t uentry, next_uentry, upending; compat_long_t futex_offset; int rc; diff 
--git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2c9120f0afc..e5325825aeb 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -620,7 +620,7 @@ static struct pmu perf_breakpoint = { .read = hw_breakpoint_pmu_read, }; -static int __init init_hw_breakpoint(void) +int __init init_hw_breakpoint(void) { unsigned int **task_bp_pinned; int cpu, err_cpu; @@ -655,6 +655,5 @@ static int __init init_hw_breakpoint(void) return -ENOMEM; } -core_initcall(init_hw_breakpoint); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9d917ff7267..9988d03797f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) struct irq_desc *desc = irq_to_desc(irq); return desc ? desc->kstat_irqs[cpu] : 0; } + +#ifdef CONFIG_GENERIC_HARDIRQS +unsigned int kstat_irqs(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + int cpu; + int sum = 0; + + if (!desc) + return 0; + for_each_possible_cpu(cpu) + sum += desc->kstat_irqs[cpu]; + return sum; +} +#endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 644e8d5fa36..5f92acc5f95 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -324,6 +324,10 @@ void enable_irq(unsigned int irq) if (!desc) return; + if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + return; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 01b1d3a8898..6c8a2a9f8a7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v) static int irq_spurious_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_spurious_proc_show, NULL); + return single_open(file, irq_spurious_proc_show, PDE(inode)->data); } static const struct file_operations irq_spurious_proc_fops = { diff --git a/kernel/irq_work.c b/kernel/irq_work.c index f16763ff848..90f881904bb 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -145,7 +145,9 @@ void irq_work_run(void) * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ - cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + (void)cmpxchg(&entry->next, + next_flags(NULL, IRQ_WORK_BUSY), + NULL); } } EXPORT_SYMBOL_GPL(irq_work_run); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7be868bf25c..3b79bd93833 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -39,6 +39,16 @@ struct jump_label_module_entry { struct module *mod; }; +void jump_label_lock(void) +{ + mutex_lock(&jump_label_mutex); +} + +void jump_label_unlock(void) +{ + mutex_unlock(&jump_label_mutex); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type) struct jump_label_module_entry *e_module; int count; - mutex_lock(&jump_label_mutex); + jump_label_lock(); entry = get_jump_label_entry((jump_label_t)key); if (entry) { count = entry->nr_entries; @@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type) count = e_module->nr_entries; iter = e_module->table; while (count--) { - if (kernel_text_address(iter->code)) + if (iter->key && + kernel_text_address(iter->code)) arch_jump_label_transform(iter, type); iter++; } } } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); } static int addr_conflict(struct jump_entry *entry, void *start, void *end) @@ -231,6 +242,7 @@ out: * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ @@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end) struct jump_entry *iter_stop = __start___jump_table; int conflict = 0; - mutex_lock(&jump_label_mutex); iter = iter_start; while (iter < iter_stop) { if (addr_conflict(iter, start, end)) { @@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end) conflict = module_conflict(start, end); #endif out: - mutex_unlock(&jump_label_mutex); return conflict; } +/* + * Not all archs need this. 
+ */ +void __weak arch_jump_label_text_poke_early(jump_label_t addr) +{ +} + static __init int init_jump_label(void) { int ret; @@ -267,7 +284,7 @@ static __init int init_jump_label(void) struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = build_jump_label_hashtable(__start___jump_table, __stop___jump_table); iter = iter_start; @@ -275,7 +292,7 @@ static __init int init_jump_label(void) arch_jump_label_text_poke_early(iter->code); iter++; } - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); return ret; } early_initcall(init_jump_label); @@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod) } } +static void remove_jump_label_module_init(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod != mod) + continue; + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (within_module_init(iter->code, mod)) + iter->key = 0; + iter++; + } + } + } + } +} + static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) @@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); ret = add_jump_label_module(mod); if (ret) remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); break; case MODULE_STATE_GOING: - mutex_lock(&jump_label_mutex); + jump_label_lock(); remove_jump_label_module(mod); - mutex_unlock(&jump_label_mutex); + jump_label_unlock(); + break; + case MODULE_STATE_LIVE: + jump_label_lock(); + remove_jump_label_module_init(mod); + jump_label_unlock(); break; } return ret; diff --git a/kernel/kexec.c b/kernel/kexec.c index c0613f7d673..b55045bc756 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image, ptr = kmap(page); /* Start with a clear page */ - memset(ptr, 0, PAGE_SIZE); + clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); if (mchunk > mbytes) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 56a89191427..9737a76e106 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; -static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ +/* This protects kprobe_table and optimizing_list */ +static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { spinlock_t lock ____cacheline_aligned_in_smp; @@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) } #ifdef CONFIG_SYSCTL +/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; @@ -607,17 +609,16 @@ static void __kprobes 
optimize_all_kprobes(void) return; kprobes_allow_optimization = true; - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) optimize_kprobe(p); } - mutex_unlock(&text_mutex); printk(KERN_INFO "Kprobes globally optimized\n"); } +/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; @@ -1144,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p) if (ret) return ret; + jump_label_lock(); preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || - jump_label_text_reserved(p->addr, p->addr)) { - preempt_enable(); - return -EINVAL; - } + jump_label_text_reserved(p->addr, p->addr)) + goto fail_with_jump_label; /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1165,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p) * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) { - preempt_enable(); - return -EINVAL; - } + if (unlikely(!try_module_get(probed_mod))) + goto fail_with_jump_label; + /* * If the module freed .init.text, we couldn't insert * kprobes in there. @@ -1176,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p) if (within_module_init((unsigned long)p->addr, probed_mod) && probed_mod->state != MODULE_STATE_COMING) { module_put(probed_mod); - preempt_enable(); - return -EINVAL; + goto fail_with_jump_label; } } preempt_enable(); + jump_label_unlock(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + jump_label_lock(); /* needed to call jump_label_text_reserved() */ + get_online_cpus(); /* For avoiding text_mutex deadlock. 
*/ mutex_lock(&text_mutex); @@ -1213,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p) out: mutex_unlock(&text_mutex); put_online_cpus(); + jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) module_put(probed_mod); return ret; + +fail_with_jump_label: + preempt_enable(); + jump_label_unlock(); + return -EINVAL; } EXPORT_SYMBOL_GPL(register_kprobe); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d..ca61bbdd44b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -265,6 +265,17 @@ int kthreadd(void *unused) return 0; } +void __init_kthread_worker(struct kthread_worker *worker, + const char *name, + struct lock_class_key *key) +{ + spin_lock_init(&worker->lock); + lockdep_set_class_and_name(&worker->lock, key, name); + INIT_LIST_HEAD(&worker->work_list); + worker->task = NULL; +} +EXPORT_SYMBOL_GPL(__init_kthread_worker); + /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 877fb306d41..17110a4a4fc 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) account_global_scheduler_latency(tsk, &lat); - /* - * short term hack; if we're > 32 we stop; future we recycle: - */ - tsk->latency_record_count++; - if (tsk->latency_record_count >= LT_SAVECOUNT) - goto out_unlock; - - for (i = 0; i < LT_SAVECOUNT; i++) { + for (i = 0; i < tsk->latency_record_count; i++) { struct latency_record *mylat; int same = 1; @@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) } } + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + /* Allocated a new one: */ - i = tsk->latency_record_count; + i = tsk->latency_record_count++; memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: diff --git a/kernel/module.c b/kernel/module.c index 2df46301a7a..d190664f25f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info) { } -static void add_kallsyms(struct module *mod, struct load_info *info) +static void add_kallsyms(struct module *mod, const struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ @@ -2326,6 +2326,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * mod->num_trace_events, GFP_KERNEL); #endif +#ifdef CONFIG_TRACING + mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", + sizeof(*mod->trace_bprintk_fmt_start), + &mod->num_trace_bprintk_fmt); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. 
+ */ + kmemleak_scan_area(mod->trace_bprintk_fmt_start, + sizeof(*mod->trace_bprintk_fmt_start) * + mod->num_trace_bprintk_fmt, GFP_KERNEL); +#endif #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* sechdrs[0].sh_size is always zero */ mod->ftrace_callsites = section_objs(info, "__mcount_loc", diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 2a5dfec8efe..2c98ad94ba0 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, return ERR_PTR(-EPERM); if (!cgroup_is_descendant(cgroup, current)) return ERR_PTR(-EPERM); + if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { + printk("ns_cgroup can't be created with parent " + "'clone_children' set.\n"); + return ERR_PTR(-EINVAL); + } + + printk_once("ns_cgroup deprecated: consider using the " + "'clone_children' flag without the ns_cgroup.\n"); ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); if (!ns_cgroup) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f309e8014c7..2870feee81d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,6 +31,7 @@ #include <linux/kernel_stat.h> #include <linux/perf_event.h> #include <linux/ftrace_event.h> +#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> @@ -417,8 +418,8 @@ event_filter_match(struct perf_event *event) return event->cpu == -1 || event->cpu == smp_processor_id(); } -static int -__event_sched_out(struct perf_event *event, +static void +event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -437,13 +438,14 @@ __event_sched_out(struct perf_event *event, } if (event->state != PERF_EVENT_STATE_ACTIVE) - return 0; + return; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } + event->tstamp_stopped = ctx->time; event->pmu->del(event, 0); event->oncpu = -1; @@ -452,19 +454,6 @@ __event_sched_out(struct perf_event *event, ctx->nr_active--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - return 1; -} - -static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - int ret; - - ret = __event_sched_out(event, cpuctx, ctx); - if (ret) - event->tstamp_stopped = ctx->time; } static void @@ -664,7 +653,7 @@ retry: } static int -__event_sched_in(struct perf_event *event, +event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -684,6 +673,10 @@ __event_sched_in(struct perf_event *event, return -EAGAIN; } + event->tstamp_running += ctx->time - event->tstamp_stopped; + + event->shadow_ctx_time = ctx->time - ctx->timestamp; + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -694,35 +687,6 @@ __event_sched_in(struct perf_event *event, return 0; } -static inline int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - int ret = __event_sched_in(event, cpuctx, ctx); - if (ret) - return ret; - event->tstamp_running += ctx->time - event->tstamp_stopped; - return 0; -} - -static void -group_commit_event_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct perf_event *event; - u64 now = ctx->time; - - group_event->tstamp_running += now - group_event->tstamp_stopped; - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(event, 
&group_event->sibling_list, group_entry) { - event->tstamp_running += now - event->tstamp_stopped; - } -} - static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, @@ -730,19 +694,15 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu; + u64 now = ctx->time; + bool simulate = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu); - /* - * use __event_sched_in() to delay updating tstamp_running - * until the transaction is committed. In case of failure - * we will keep an unmodified tstamp_running which is a - * requirement to get correct timing information - */ - if (__event_sched_in(group_event, cpuctx, ctx)) { + if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); return -EAGAIN; } @@ -751,31 +711,42 @@ group_sched_in(struct perf_event *group_event, * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (__event_sched_in(event, cpuctx, ctx)) { + if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - if (!pmu->commit_txn(pmu)) { - /* commit tstamp_running */ - group_commit_event_sched_in(group_event, cpuctx, ctx); + if (!pmu->commit_txn(pmu)) return 0; - } + group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * The events up to the failed event are scheduled out normally, + * tstamp_stopped will be updated. * - * use __event_sched_out() to avoid updating tstamp_stopped - * because the event never actually ran + * The failed events and the remaining siblings need to have + * their timings updated as if they had gone thru event_sched_in() + * and event_sched_out(). This is required to get consistent timings + * across the group. This also takes care of the case where the group + * could never be scheduled by ensuring tstamp_stopped is set to mark + * the time the event was actually stopped, such that time delta + * calculation in update_event_times() is correct. */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) - break; - __event_sched_out(event, cpuctx, ctx); + simulate = true; + + if (simulate) { + event->tstamp_running += now - event->tstamp_stopped; + event->tstamp_stopped = now; + } else { + event_sched_out(event, cpuctx, ctx); + } } - __event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, cpuctx, ctx); pmu->cancel_txn(pmu); @@ -1316,8 +1287,6 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); } @@ -1651,8 +1620,12 @@ static void rotate_ctx(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); - /* Rotate the first entry last of non-pinned groups */ - list_rotate_left(&ctx->flexible_groups); + /* + * Rotate the first entry last of non-pinned groups. Rotation might be + * disabled by the inheritance code. 
+ */ + if (!ctx->rotate_disable) + list_rotate_left(&ctx->flexible_groups); raw_spin_unlock(&ctx->lock); } @@ -2264,11 +2237,6 @@ int perf_event_release_kernel(struct perf_event *event) raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); - mutex_lock(&event->owner->perf_event_mutex); - list_del_init(&event->owner_entry); - mutex_unlock(&event->owner->perf_event_mutex); - put_task_struct(event->owner); - free_event(event); return 0; @@ -2281,9 +2249,43 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); static int perf_release(struct inode *inode, struct file *file) { struct perf_event *event = file->private_data; + struct task_struct *owner; file->private_data = NULL; + rcu_read_lock(); + owner = ACCESS_ONCE(event->owner); + /* + * Matches the smp_wmb() in perf_event_exit_task(). If we observe + * !owner it means the list deletion is complete and we can indeed + * free this event, otherwise we need to serialize on + * owner->perf_event_mutex. + */ + smp_read_barrier_depends(); + if (owner) { + /* + * Since delayed_put_task_struct() also drops the last + * task reference we can safely take a new reference + * while holding the rcu_read_lock(). + */ + get_task_struct(owner); + } + rcu_read_unlock(); + + if (owner) { + mutex_lock(&owner->perf_event_mutex); + /* + * We have to re-check the event->owner field, if it is cleared + * we raced with perf_event_exit_task(), acquiring the mutex + * ensured they're done, and we can proceed with freeing the + * event. + */ + if (event->owner) + list_del_init(&event->owner_entry); + mutex_unlock(&owner->perf_event_mutex); + put_task_struct(owner); + } + return perf_event_release_kernel(event); } @@ -3428,7 +3430,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) } static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { u64 read_format = event->attr.read_format; u64 values[4]; @@ -3436,11 +3439,11 @@ static void perf_output_read_one(struct perf_output_handle *handle, values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = event->total_time_enabled + + values[n++] = enabled + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = event->total_time_running + + values[n++] = running + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) @@ -3453,7 +3456,8 @@ static void perf_output_read_one(struct perf_output_handle *handle, * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 
*/ static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_event *event) + struct perf_event *event, + u64 enabled, u64 running) { struct perf_event *leader = event->group_leader, *sub; u64 read_format = event->attr.read_format; @@ -3463,10 +3467,10 @@ static void perf_output_read_group(struct perf_output_handle *handle, values[n++] = 1 + leader->nr_siblings; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; + values[n++] = enabled; if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; + values[n++] = running; if (leader != event) leader->pmu->read(leader); @@ -3491,13 +3495,35 @@ static void perf_output_read_group(struct perf_output_handle *handle, } } +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. + * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + if (event->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, event); + perf_output_read_group(handle, event, enabled, running); else - perf_output_read_one(handle, event); + perf_output_read_one(handle, event, enabled, running); } void perf_output_sample(struct perf_output_handle *handle, @@ -3798,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; perf_event_task_ctx(&cpuctx->ctx, task_event); ctx = task_event->task_ctx; @@ -3933,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; perf_event_comm_ctx(&cpuctx->ctx, comm_event); ctxn = pmu->task_ctx_nr; @@ -4118,6 +4148,8 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); @@ -4687,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event) break; } - if (event_id > PERF_COUNT_SW_MAX) + if (event_id >= PERF_COUNT_SW_MAX) return -ENOENT; if (!event->parent) { @@ -5119,20 +5151,36 @@ static void *find_pmu_context(int ctxn) return NULL; } -static void free_pmu_context(void * __percpu cpu_context) +static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) { - struct pmu *pmu; + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + + if (cpuctx->active_pmu == old_pmu) + cpuctx->active_pmu = pmu; + } +} + +static void free_pmu_context(struct pmu *pmu) +{ + struct pmu *i; mutex_lock(&pmus_lock); /* * Like a real lame refcount. 
*/ - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->pmu_cpu_context == cpu_context) + list_for_each_entry(i, &pmus, entry) { + if (i->pmu_cpu_context == pmu->pmu_cpu_context) { + update_pmu_context(i, pmu); goto out; + } } - free_percpu(cpu_context); + free_percpu(pmu->pmu_cpu_context); out: mutex_unlock(&pmus_lock); } @@ -5164,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu) cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; INIT_LIST_HEAD(&cpuctx->rotation_list); + cpuctx->active_pmu = pmu; } got_cpu_context: @@ -5215,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_rcu(); free_percpu(pmu->pmu_disable_count); - free_pmu_context(pmu->pmu_cpu_context); + free_pmu_context(pmu); } struct pmu *perf_init_event(struct perf_event *event) @@ -5683,7 +5732,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); event->owner = current; - get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -5751,12 +5800,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, ++ctx->generation; mutex_unlock(&ctx->mutex); - event->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_event_mutex); - list_add_tail(&event->owner_entry, ¤t->perf_event_list); - mutex_unlock(¤t->perf_event_mutex); - return event; err_free: @@ -5907,8 +5950,24 @@ again: */ void perf_event_exit_task(struct task_struct *child) { + struct perf_event *event, *tmp; int ctxn; + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. + */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); } @@ -6128,6 +6187,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; + unsigned long flags; int ret = 0; child->perf_event_ctxp[ctxn] = NULL; @@ -6168,6 +6228,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. 
+ */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { ret = inherit_task_group(event, parent, parent_ctx, child, ctxn, &inherited_all); @@ -6175,6 +6244,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn) break; } + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { @@ -6327,6 +6400,8 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) void __init perf_event_init(void) { + int ret; + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent); @@ -6334,4 +6409,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_task_clock); perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); } diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index c7a8f453919..aeaa7f84682 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -121,10 +121,10 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) switch (o->type) { case PM_QOS_MIN: - return plist_last(&o->requests)->prio; + return plist_first(&o->requests)->prio; case PM_QOS_MAX: - return plist_first(&o->requests)->prio; + return plist_last(&o->requests)->prio; default: /* runtime check for not using enum */ diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 6842eeba587..05bb7173850 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -37,13 +37,13 @@ static int check_clock(const clockid_t which_clock) if (pid == 0) return 0; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_vpid(pid); if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? 
- same_thread_group(p, current) : thread_group_leader(p))) { + same_thread_group(p, current) : has_group_leader_pid(p))) { error = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return error; } @@ -390,7 +390,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) INIT_LIST_HEAD(&new_timer->it.cpu.entry); - read_lock(&tasklist_lock); + rcu_read_lock(); if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { if (pid == 0) { p = current; @@ -404,7 +404,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) p = current->group_leader; } else { p = find_task_by_vpid(pid); - if (p && !thread_group_leader(p)) + if (p && !has_group_leader_pid(p)) p = NULL; } } @@ -414,7 +414,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) } else { ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 29bff6117ab..265729966ec 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n -config SUSPEND_NVS - bool - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE - select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -140,7 +136,6 @@ config HIBERNATION depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS - select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -246,9 +241,13 @@ config PM_OPS depends on PM_SLEEP || PM_RUNTIME default y +config ARCH_HAS_OPP + bool + config PM_OPP bool "Operating Performance Point (OPP) Layer library" depends on PM + depends on ARCH_HAS_OPP ---help--- SOCs have a standard set of tuples consisting of frequency and voltage pairs that the device will support per voltage domain. This diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185..120a1582332 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 657272e91d0..048d0b51483 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -327,7 +327,6 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; - gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -339,7 +338,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -348,7 +347,10 @@ int hibernation_snapshot(int platform_mode) goto Recover_platform; error = create_image(platform_mode); - /* Control returns here after successful restore */ + /* + * Control returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ Resume_devices: /* We may need to release the preallocated image pages here. */ @@ -357,7 +359,10 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + resume_console(); Close: platform_end(platform_mode); @@ -452,17 +457,16 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; - gfp_t saved_mask; pm_prepare_console(); suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); pm_restore_console(); return error; @@ -476,7 +480,6 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; - gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -492,7 +495,6 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -536,7 +538,6 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); - set_gfp_allowed_mask(saved_mask); resume_console(); Close: @@ -646,6 +647,7 @@ int hibernate(void) swsusp_free(); if (!error) power_down(); + pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); } diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 1836db60bbb..00000000000 --- a/kernel/power/nvs.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include <linux/io.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/suspend.h> - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * suspend and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * suspend_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int suspend_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? 
size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * suspend_nvs_free - free data pages allocated for saving NVS regions - */ -void suspend_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * suspend_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int suspend_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - suspend_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * suspend_nvs_save - save NVS memory regions - */ -void suspend_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * suspend_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void suspend_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ac7eb109f19..0dac75ea445 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) src = kmap_atomic(s_page, KM_USER0); dst = kmap_atomic(d_page, KM_USER1); do_copy_page(dst, src); - kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel @@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) */ safe_copy_page(buffer, s_page); dst = kmap_atomic(d_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); } else { safe_copy_page(page_address(d_page), s_page); @@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle) memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); } else if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); + clear_page(buffer); pack_pfns(buffer, &orig_bm); } else { struct page *page; @@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle) void *kaddr; kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); + copy_page(buffer, kaddr); kunmap_atomic(kaddr, KM_USER0); handle->buffer = buffer; } else { @@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void) void *dst; dst = kmap_atomic(last_highmem_page, KM_USER0); - memcpy(dst, buffer, PAGE_SIZE); + copy_page(dst, buffer); kunmap_atomic(dst, KM_USER0); last_highmem_page = NULL; } @@ -2270,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) kaddr1 = kmap_atomic(p1, KM_USER0); kaddr2 = kmap_atomic(p2, KM_USER1); - memcpy(buf, kaddr1, PAGE_SIZE); - 
memcpy(kaddr1, kaddr2, PAGE_SIZE); - memcpy(kaddr2, buf, PAGE_SIZE); - kunmap_atomic(kaddr1, KM_USER0); + copy_page(buf, kaddr1); + copy_page(kaddr1, kaddr2); + copy_page(kaddr2, buf); kunmap_atomic(kaddr2, KM_USER1); + kunmap_atomic(kaddr1, KM_USER0); } /** diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee47..ecf770509d0 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -197,7 +197,6 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; - gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -208,7 +207,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - saved_mask = clear_gfp_allowed_mask(GFP_IOFS); + pm_restrict_gfp_mask(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -225,7 +224,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - set_gfp_allowed_mask(saved_mask); + pm_restore_gfp_mask(); resume_console(); Close: if (suspend_ops->end) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 916eaa79039..8c7e4832b9b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -6,6 +6,7 @@ * * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. * @@ -29,7 +30,7 @@ #include "power.h" -#define HIBERNATE_SIG "LINHIB0001" +#define HIBERNATE_SIG "S1SUSPEND" /* * The swap map is a data structure used for keeping track of each page @@ -251,7 +252,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (bio_chain) { src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (src) { - memcpy(src, buf, PAGE_SIZE); + copy_page(src, buf); } else { WARN_ON_ONCE(1); bio_chain = NULL; /* Go synchronous */ @@ -325,7 +326,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, error = write_page(handle->cur, handle->cur_swap, NULL); if (error) goto out; - memset(handle->cur, 0, PAGE_SIZE); + clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } @@ -753,30 +754,43 @@ static int load_image_lzo(struct swap_map_handle *handle, { unsigned int m; int error = 0; + struct bio *bio; struct timeval start; struct timeval stop; unsigned nr_pages; - size_t off, unc_len, cmp_len; - unsigned char *unc, *cmp, *page; + size_t i, off, unc_len, cmp_len; + unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (!page) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); - return -ENOMEM; + for (i = 0; i < LZO_CMP_PAGES; i++) { + page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page[i]) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + + while (i) + free_page((unsigned long)page[--i]); + + return -ENOMEM; + } } unc = vmalloc(LZO_UNC_SIZE); if (!unc) { printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - free_page((unsigned long)page); + + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); + return -ENOMEM; } cmp = vmalloc(LZO_CMP_SIZE); if (!cmp) { printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); - free_page((unsigned long)page); + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); + return -ENOMEM; } @@ -787,6 +801,7 @@ static int load_image_lzo(struct 
swap_map_handle *handle, if (!m) m = 1; nr_pages = 0; + bio = NULL; do_gettimeofday(&start); error = snapshot_write_next(snapshot); @@ -794,11 +809,11 @@ static int load_image_lzo(struct swap_map_handle *handle, goto out_finish; for (;;) { - error = swap_read_page(handle, page, NULL); /* sync */ + error = swap_read_page(handle, page[0], NULL); /* sync */ if (error) break; - cmp_len = *(size_t *)page; + cmp_len = *(size_t *)page[0]; if (unlikely(!cmp_len || cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { printk(KERN_ERR "PM: Invalid LZO compressed length\n"); @@ -806,13 +821,20 @@ static int load_image_lzo(struct swap_map_handle *handle, break; } - memcpy(cmp, page, PAGE_SIZE); - for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { - error = swap_read_page(handle, page, NULL); /* sync */ + for (off = PAGE_SIZE, i = 1; + off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { + error = swap_read_page(handle, page[i], &bio); if (error) goto out_finish; + } - memcpy(cmp + off, page, PAGE_SIZE); + error = hib_wait_on_bio_chain(&bio); /* need all data now */ + if (error) + goto out_finish; + + for (off = 0, i = 0; + off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { + memcpy(cmp + off, page[i], PAGE_SIZE); } unc_len = LZO_UNC_SIZE; @@ -857,7 +879,8 @@ out_finish: vfree(cmp); vfree(unc); - free_page((unsigned long)page); + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); return error; } @@ -910,7 +933,7 @@ int swsusp_check(void) hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); - memset(swsusp_header, 0, PAGE_SIZE); + clear_page(swsusp_header); error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) diff --git a/kernel/power/user.c b/kernel/power/user.c index e819e17877c..c36c3b9e8a8 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) free_all_swap_pages(data->swap); if (data->frozen) thaw_processes(); - pm_notifier_call_chain(data->mode == O_WRONLY ? + pm_notifier_call_chain(data->mode == O_RDONLY ? 
PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); @@ -263,6 +263,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; + pm_restore_gfp_mask(); thaw_processes(); usermodehelper_enable(); data->frozen = 0; @@ -275,6 +276,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = -EPERM; break; } + pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); if (!error) error = put_user(in_suspend, (int __user *)arg); diff --git a/kernel/printk.c b/kernel/printk.c index 2531017795f..a23315dc449 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY -static unsigned int boot_delay; /* msecs delay after each printk during bootup */ +static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) @@ -261,6 +261,12 @@ static inline void boot_delay_msec(void) } #endif +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; @@ -268,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) char c; int error = 0; - error = security_syslog(type, from_file); + /* + * If this is from /proc/kmsg we only do the capabilities checks + * at open time. + */ + if (type == SYSLOG_ACTION_OPEN || !from_file) { + if (dmesg_restrict && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + error = security_syslog(type); if (error) return error; @@ -647,6 +666,7 @@ static inline int can_use_console(unsigned int cpu) * released but interrupts still disabled. */ static int acquire_console_semaphore_for_printk(unsigned int cpu) + __releases(&logbuf_lock) { int retval = 0; @@ -1062,13 +1082,15 @@ void printk_tick(void) int printk_needs_cpu(int cpu) { + if (unlikely(cpu_is_offline(cpu))) + printk_tick(); return per_cpu(printk_pending, cpu); } void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - __raw_get_cpu_var(printk_pending) = 1; + this_cpu_write(printk_pending, 1); } /** @@ -1511,7 +1533,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char const *kmsg_reasons[] = { +static const char * const kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", [KMSG_DUMP_KEXEC] = "kexec", diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f34d798ef4a..99bbaa3e5b0 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task) * under ptrace. */ retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(&task->cred_guard_mutex)) + if (mutex_lock_interruptible(&task->signal->cred_guard_mutex)) goto out; task_lock(task); @@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task) unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: - mutex_unlock(&task->cred_guard_mutex); + mutex_unlock(&task->signal->cred_guard_mutex); out: return retval; } @@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data) * and reacquire the lock. 
*/ void exit_ptrace(struct task_struct *tracer) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) { struct task_struct *p, *n; LIST_HEAD(ptrace_dead); @@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds return copied; } -static int ptrace_setoptions(struct task_struct *child, long data) +static int ptrace_setoptions(struct task_struct *child, unsigned long data) { child->ptrace &= ~PT_TRACE_MASK; @@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) #define is_sysemu_singlestep(request) 0 #endif -static int ptrace_resume(struct task_struct *child, long request, long data) +static int ptrace_resume(struct task_struct *child, long request, + unsigned long data) { if (!valid_signal(data)) return -EIO; @@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, #endif int ptrace_request(struct task_struct *child, long request, - long addr, long data) + unsigned long addr, unsigned long data) { int ret = -EIO; siginfo_t siginfo; + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = datavp; switch (request) { case PTRACE_PEEKTEXT: @@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setoptions(child, data); break; case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message, (unsigned long __user *) data); + ret = put_user(child->ptrace_message, datalp); break; case PTRACE_GETSIGINFO: ret = ptrace_getsiginfo(child, &siginfo); if (!ret) - ret = copy_siginfo_to_user((siginfo_t __user *) data, - &siginfo); + ret = copy_siginfo_to_user(datavp, &siginfo); break; case PTRACE_SETSIGINFO: - if (copy_from_user(&siginfo, (siginfo_t __user *) data, - sizeof siginfo)) + if (copy_from_user(&siginfo, datavp, sizeof siginfo)) ret = -EFAULT; else ret = ptrace_setsiginfo(child, &siginfo); @@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request, } mmput(mm); - ret = put_user(tmp, (unsigned long __user *) data); + ret = put_user(tmp, datalp); break; } #endif @@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_SETREGSET: { struct iovec kiov; - struct iovec __user *uiov = (struct iovec __user *) data; + struct iovec __user *uiov = datavp; if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) return -EFAULT; @@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid) #define arch_ptrace_attach(child) do { } while (0) #endif -SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) +SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, + unsigned long, data) { struct task_struct *child; long ret; @@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) return ret; } -int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, + unsigned long data) { unsigned long tmp; int copied; @@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) return put_user(tmp, (unsigned long __user *)data); } -int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) +int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, + unsigned long data) { int copied; diff --git a/kernel/range.c b/kernel/range.c index 471b66acabb..37fa9b99ad5 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -119,7 +119,7 @@ static int 
cmp_range(const void *x1, const void *x2) int clean_sort_range(struct range *range, int az) { - int i, j, k = az - 1, nr_range = 0; + int i, j, k = az - 1, nr_range = az; for (i = 0; i < k; i++) { if (range[i].end) diff --git a/kernel/relay.c b/kernel/relay.c index c7cf397fb92..859ea5a9605 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - struct page **array; - size_t pa_size = n_pages * sizeof(struct page *); - - if (pa_size > PAGE_SIZE) { - array = vmalloc(pa_size); - if (array) - memset(array, 0, pa_size); - } else { - array = kzalloc(pa_size, GFP_KERNEL); - } - return array; + const size_t pa_size = n_pages * sizeof(struct page *); + if (pa_size > PAGE_SIZE) + return vzalloc(pa_size); + return kzalloc(pa_size, GFP_KERNEL); } /* diff --git a/kernel/resource.c b/kernel/resource.c index 7b36976e5de..798e2fae2a0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -357,6 +357,32 @@ int __weak page_is_ram(unsigned long pfn) return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +void __weak arch_remove_reservations(struct resource *avail) +{ +} + +static resource_size_t simple_align_resource(void *data, + const struct resource *avail, + resource_size_t size, + resource_size_t align) +{ + return avail->start; +} + +static void resource_clip(struct resource *res, resource_size_t min, + resource_size_t max) +{ + if (res->start < min) + res->start = min; + if (res->end > max) + res->end = max; +} + +static bool resource_contains(struct resource *res1, struct resource *res2) +{ + return res1->start <= res2->start && res1->end >= res2->end; +} + /* * Find empty slot in the resource tree given range and alignment. 
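The resource_clip()/resource_contains()/simple_align_resource() helpers added above feed the reworked find_resource() in the next hunk, which now clips the candidate window, aligns its start, and only accepts the allocation if the aligned block still fits, guarding against ALIGN() wrapping around. Below is a userspace sketch of that clip-align-contains sequence, with a hypothetical struct range and ALIGN_UP() standing in for struct resource and the kernel's ALIGN():

#include <stdbool.h>
#include <stdint.h>

/* Inclusive [start, end] window, like struct resource.  ALIGN_UP() assumes a
 * power-of-two alignment, as the kernel's ALIGN() does. */
struct range {
        uint64_t start;
        uint64_t end;
};

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

static void range_clip(struct range *r, uint64_t min, uint64_t max)
{
        if (r->start < min)
                r->start = min;
        if (r->end > max)
                r->end = max;
}

static bool range_contains(const struct range *outer, const struct range *inner)
{
        return outer->start <= inner->start && outer->end >= inner->end;
}

/* Carve an aligned block of 'size' bytes out of the free window 'gap',
 * restricted to [min, max].  Returns false if it does not fit. */
static bool alloc_from_gap(struct range gap, uint64_t min, uint64_t max,
                           uint64_t size, uint64_t align, struct range *out)
{
        struct range avail, alloc;

        range_clip(&gap, min, max);

        avail = gap;
        avail.start = ALIGN_UP(gap.start, align);
        if (avail.start < gap.start)            /* ALIGN_UP() wrapped around */
                return false;

        if (!size)
                return false;
        alloc.start = avail.start;
        alloc.end = alloc.start + size - 1;
        if (alloc.end < alloc.start)            /* start + size overflowed */
                return false;

        if (!range_contains(&avail, &alloc))    /* does not fit in the window */
                return false;

        *out = alloc;
        return true;
}

The same hunk also makes allocate_resource() substitute simple_align_resource() when no alignf callback is supplied, so the alignment hook can be invoked unconditionally.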
*/ @@ -370,8 +396,9 @@ static int find_resource(struct resource *root, struct resource *new, void *alignf_data) { struct resource *this = root->child; - struct resource tmp = *new; + struct resource tmp = *new, avail, alloc; + tmp.flags = new->flags; tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment @@ -386,17 +413,22 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = this->start - 1; else tmp.end = root->end; - if (tmp.start < min) - tmp.start = min; - if (tmp.end > max) - tmp.end = max; - tmp.start = ALIGN(tmp.start, align); - if (alignf) - tmp.start = alignf(alignf_data, &tmp, size, align); - if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { - new->start = tmp.start; - new->end = tmp.start + size - 1; - return 0; + + resource_clip(&tmp, min, max); + arch_remove_reservations(&tmp); + + /* Check for overflow after ALIGN() */ + avail = *new; + avail.start = ALIGN(tmp.start, align); + avail.end = tmp.end; + if (avail.start >= tmp.start) { + alloc.start = alignf(alignf_data, &avail, size, align); + alloc.end = alloc.start + size - 1; + if (resource_contains(&avail, &alloc)) { + new->start = alloc.start; + new->end = alloc.end; + return 0; + } } if (!this) break; @@ -428,6 +460,9 @@ int allocate_resource(struct resource *root, struct resource *new, { int err; + if (!alignf) + alignf = simple_align_resource; + write_lock(&resource_lock); err = find_resource(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) @@ -453,6 +488,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou if (first == parent) return first; + if (WARN_ON(first == new)) /* duplicated insertion */ + return first; if ((first->start > new->start) || (first->end < new->end)) break; diff --git a/kernel/sched.c b/kernel/sched.c index d42992bccdf..297d1a0eedb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -560,18 +560,8 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) -{ - rq->curr->sched_class->check_preempt_curr(rq, p, flags); - /* - * A queue event has occurred, and we're going to schedule. In - * this case, we can save a useless back to back clock update. - */ - if (test_tsk_need_resched(p)) - rq->skip_clock_update = 1; -} +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); static inline int cpu_of(struct rq *rq) { @@ -646,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ -static u64 irq_time_cpu(int cpu); -static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); +static void update_rq_clock_task(struct rq *rq, s64 delta); -inline void update_rq_clock(struct rq *rq) +static void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) { - int cpu = cpu_of(rq); - u64 irq_time; + s64 delta; - rq->clock = sched_clock_cpu(cpu); - irq_time = irq_time_cpu(cpu); - if (rq->clock - irq_time > rq->clock_task) - rq->clock_task = rq->clock - irq_time; + if (rq->skip_clock_update) + return; - sched_irq_time_avg_update(rq, irq_time); - } + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; + rq->clock += delta; + update_rq_clock_task(rq, delta); } /* @@ -1934,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) * They are read and saved off onto struct rq in update_rq_clock(). 
* This may result in other CPU reading this CPU's irq time and can * race with irq/account_system_vtime on this CPU. We would either get old - * or new value (or semi updated value on 32 bit) with a side effect of - * accounting a slice of irq time to wrong task when irq is in progress - * while we read rq->clock. That is a worthy compromise in place of having - * locks on each irq in account_system_time. + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. */ static DEFINE_PER_CPU(u64, cpu_hardirq_time); static DEFINE_PER_CPU(u64, cpu_softirq_time); @@ -1955,19 +1940,58 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } -static u64 irq_time_cpu(int cpu) +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) { - if (!sched_clock_irqtime) - return 0; + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); } +#endif /* CONFIG_64BIT */ +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ void account_system_vtime(struct task_struct *curr) { unsigned long flags; + s64 delta; int cpu; - u64 now, delta; if (!sched_clock_irqtime) return; @@ -1975,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); - now = sched_clock_cpu(cpu); - delta = now - per_cpu(irq_start_time, cpu); - per_cpu(irq_start_time, cpu) = now; + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread @@ -1985,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr) * that do not consume any time, but still wants to run. 
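On 32-bit kernels a cross-CPU reader could otherwise observe a torn 64-bit cpu_hardirq_time/cpu_softirq_time value, so the hunk above brackets the writer with irq_time_write_begin()/irq_time_write_end() and has irq_time_read() retry until it sees an even, unchanged sequence count. Below is a standalone sketch of that write/retry pattern; it is not the kernel's seqcount_t API, the per-CPU accessors are dropped, and the GCC __atomic fences only approximate smp_wmb()/smp_rmb():

#include <stdint.h>

/* Toy seqcount guarding two 64-bit counters on a 32-bit machine: one writer,
 * any number of readers.  Real code should use the kernel's seqcount_t. */
struct irqtime {
        unsigned int seq;       /* even: stable, odd: update in progress */
        uint64_t hardirq_time;
        uint64_t softirq_time;
};

static void irqtime_account(struct irqtime *t, uint64_t hard, uint64_t soft)
{
        __atomic_store_n(&t->seq, t->seq + 1, __ATOMIC_RELAXED); /* seq goes odd */
        __atomic_thread_fence(__ATOMIC_RELEASE);                 /* roughly smp_wmb() */
        t->hardirq_time += hard;
        t->softirq_time += soft;
        __atomic_thread_fence(__ATOMIC_RELEASE);                 /* roughly smp_wmb() */
        __atomic_store_n(&t->seq, t->seq + 1, __ATOMIC_RELAXED); /* seq even again */
}

static uint64_t irqtime_read(const struct irqtime *t)
{
        unsigned int seq;
        uint64_t sum;

        do {
                seq = __atomic_load_n(&t->seq, __ATOMIC_ACQUIRE);
                sum = t->hardirq_time + t->softirq_time; /* may tear; retry catches it */
                __atomic_thread_fence(__ATOMIC_ACQUIRE); /* roughly smp_rmb() */
        } while ((seq & 1) || __atomic_load_n(&t->seq, __ATOMIC_RELAXED) != seq);

        return sum;
}

The CONFIG_64BIT branch keeps plain per-CPU additions, since an aligned 64-bit load cannot be torn there and no sequence count is needed.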
*/ if (hardirq_count()) - per_cpu(cpu_hardirq_time, cpu) += delta; + __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) - per_cpu(cpu_softirq_time, cpu) += delta; + __this_cpu_add(cpu_softirq_time, delta); + irq_time_write_end(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(account_system_vtime); -static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) +static void update_rq_clock_task(struct rq *rq, s64 delta) { - if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { - u64 delta_irq = curr_irq_time - rq->prev_irq_time; - rq->prev_irq_time = curr_irq_time; - sched_rt_avg_update(rq, delta_irq); - } + s64 irq_delta; + + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight miss-attribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. + */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + rq->clock_task += delta; + + if (irq_delta && sched_feat(NONIRQ_POWER)) + sched_rt_avg_update(rq, irq_delta); } -#else +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ -static u64 irq_time_cpu(int cpu) +static void update_rq_clock_task(struct rq *rq, s64 delta) { - return 0; + rq->clock_task += delta; } -static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } - -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #include "sched_idletask.c" #include "sched_fair.c" @@ -2118,6 +2165,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } +static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +{ + const struct sched_class *class; + + if (p->sched_class == rq->curr->sched_class) { + rq->curr->sched_class->check_preempt_curr(rq, p, flags); + } else { + for_each_class(class) { + if (class == rq->curr->sched_class) + break; + if (class == p->sched_class) { + resched_task(rq->curr); + break; + } + } + } + + /* + * A queue event has occurred, and we're going to schedule. In + * this case, we can save a useless back to back clock update. + */ + if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + rq->skip_clock_update = 1; +} + #ifdef CONFIG_SMP /* * Is this task likely cache-hot: @@ -3104,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + #ifdef CONFIG_NO_HZ /* * For NO_HZ we delay the active fold to the next LOAD_FREQ update. @@ -3133,6 +3214,128 @@ static long calc_load_fold_idle(void) return delta; } + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. 
+ * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(unsigned long ticks) +{ + long delta, active, n; + + if (time_before(jiffies, calc_load_update)) + return; + + /* + * If we crossed a calc_load_update boundary, make sure to fold + * any pending idle changes, the respective CPUs might have + * missed the tick driven calc_load_account_active() update + * due to NO_HZ. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + /* + * If we were idle for multiple load cycles, apply them. + */ + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. 
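The calc_load_n()/fixed_power_int() pair above evaluates the closed form a_n = a_0*e^n + a*(1 - e^n), so a CPU that was idle across several LOAD_FREQ windows can age the load averages in one step rather than replaying every missed update. A self-contained sketch of the same fixed-point arithmetic follows; FSHIFT, FIXED_1 and EXP_1 are the values I believe the scheduler headers use (11 fractional bits, 5 s sampling against a 1 min window), but treat them as illustrative:

#include <stdio.h>

#define FSHIFT  11                      /* fixed-point fractional bits */
#define FIXED_1 (1UL << FSHIFT)         /* 1.0 */
#define EXP_1   1884                    /* exp(-5s/1min) in fixed point */

/* One EMA step: load = load*exp + active*(1 - exp), with rounding. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);
        return load >> FSHIFT;
}

/* x^n in fixed point via binary exponentiation: O(log n) multiplies. */
static unsigned long fixed_power(unsigned long x, unsigned int n)
{
        unsigned long result = FIXED_1;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (FSHIFT - 1);
                        result >>= FSHIFT;
                }
                n >>= 1;
                x *= x;
                x += 1UL << (FSHIFT - 1);
                x >>= FSHIFT;
        }
        return result;
}

/* Fold n missed periods at once: a_n = a_0*e^n + a*(1 - e^n). */
static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                 unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power(exp, n), active);
}

int main(void)
{
        unsigned long avenrun = 2 * FIXED_1;    /* 1-minute average of 2.00 */

        /* CPU was idle (active == 0) for twelve 5-second periods. */
        avenrun = calc_load_n(avenrun, EXP_1, 0, 12);
        printf("load after 60s idle: %lu.%02lu\n", avenrun >> FSHIFT,
               ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}

Folding twelve idle 5-second periods this way decays a 1-minute average of 2.00 to roughly 0.73, matching twelve successive calc_load() steps to within fixed-point rounding.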
+ */ +} #else static void calc_load_account_idle(struct rq *this_rq) { @@ -3142,6 +3345,10 @@ static inline long calc_load_fold_idle(void) { return 0; } + +static void calc_global_nohz(unsigned long ticks) +{ +} #endif /** @@ -3159,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; -} - /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. */ -void calc_global_load(void) +void calc_global_load(unsigned long ticks) { - unsigned long upd = calc_load_update + 10; long active; - if (time_before(jiffies, upd)) + calc_global_nohz(ticks); + + if (time_before(jiffies, calc_load_update + 10)) return; active = atomic_long_read(&calc_load_tasks); @@ -3830,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) { if (prev->se.on_rq) update_rq_clock(rq); - rq->skip_clock_update = 0; prev->sched_class->put_prev_task(rq, prev); } @@ -3888,7 +4087,6 @@ need_resched_nonpreemptible: hrtick_clear(rq); raw_spin_lock_irq(&rq->lock); - clear_tsk_need_resched(prev); switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { @@ -3920,6 +4118,8 @@ need_resched_nonpreemptible: put_prev_task(rq, prev); next = pick_next_task(rq); + clear_tsk_need_resched(prev); + rq->skip_clock_update = 0; if (likely(prev != next)) { sched_info_switch(prev, next); @@ -6960,6 +7160,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (cpu != group_first_cpu(sd->groups)) return; + sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); + child = sd->child; sd->groups->cpu_power = 0; @@ -8510,12 +8712,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - set_task_rq(tsk, task_cpu(tsk)); - #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk, on_rq); + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else #endif + set_task_rq(tsk, task_cpu(tsk)); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 933f3d1b62e..00ebd768667 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; - if (unlikely(rt_prio(p->prio))) - goto preempt; - - if (unlikely(p->sched_class != &fair_sched_class)) - return; - if (unlikely(se == pse)) return; @@ -1764,10 +1758,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); - - /* re-arm NEWIDLE balancing when moving tasks */ - src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; - this_rq->idle_stamp = 0; } /* @@ -2035,13 +2025,16 @@ struct sd_lb_stats { unsigned long this_load_per_task; unsigned long this_nr_running; unsigned long this_has_capacity; + unsigned int this_idle_cpus; /* Statistics of the busiest group */ + unsigned int busiest_idle_cpus; unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long 
busiest_group_capacity; unsigned long busiest_has_capacity; + unsigned int busiest_group_weight; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2063,6 +2056,8 @@ struct sg_lb_stats { unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; + unsigned long idle_cpus; + unsigned long group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ }; @@ -2431,7 +2426,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_load += load; sgs->sum_nr_running += rq->nr_running; sgs->sum_weighted_load += weighted_cpuload(i); - + if (idle_cpu(i)) + sgs->idle_cpus++; } /* @@ -2469,6 +2465,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); + sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) sgs->group_has_capacity = 1; @@ -2576,13 +2573,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; sds->this_has_capacity = sgs.group_has_capacity; + sds->this_idle_cpus = sgs.idle_cpus; } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_idle_cpus = sgs.idle_cpus; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->busiest_has_capacity = sgs.group_has_capacity; + sds->busiest_group_weight = sgs.group_weight; sds->group_imb = sgs.group_imb; } @@ -2860,8 +2860,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.this_load >= sds.avg_load) goto out_balanced; - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; + /* + * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. + * And to check for busy balance use !idle_cpu instead of + * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE + * even when they are idle. + */ + if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; + } else { + /* + * This cpu is idle. If the busiest group load doesn't + * have more tasks than the number of available cpu's and + * there is no imbalance between this and busiest group + * wrt to idle cpu's, it is balanced. + */ + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + sds.busiest_nr_running <= sds.busiest_group_weight) + goto out_balanced; + } force_balance: /* Looks like there is an imbalance. 
Compute it */ @@ -3197,8 +3215,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } raw_spin_lock(&this_rq->lock); @@ -3869,13 +3889,26 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p, int on_rq) +static void task_move_group_fair(struct task_struct *p, int on_rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(p); - - update_curr(cfs_rq); + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. + */ + if (!on_rq) + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); if (!on_rq) - place_entity(cfs_rq, &p->se, 1); + p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } #endif @@ -3927,7 +3960,7 @@ static const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .moved_group = moved_group_fair, + .task_move_group = task_move_group_fair, #endif }; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 25c2f962f6f..48ddf431db0 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) } /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). - * - * Though we are interested in knowing how long it was from the *first* time a + * We are interested in knowing how long it was from the *first* time a * task was queued to the time that it finally hit a cpu, we call this routine * from dequeue_task() to account for possible rq->clock skew across cpus. The * delta taken on each cpu would annul the skew. @@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t) } /* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * * This function is only called from enqueue_task(), but also only updates * the timestamp if it is already not set. 
It's assumed that * sched_info_dequeued() will clear that stamp when appropriate. diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 45bddc0c104..2bf6b47058c 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p, static void check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) { - resched_task(rq->curr); /* we preempt everything */ + /* we're never preempted */ } static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->state == TASK_RUNNING) + if (stop && stop->se.on_rq) return stop; return NULL; diff --git a/kernel/signal.c b/kernel/signal.c index 919562c3d6b..4e3cff10fdc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p) return count; } -struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) +struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, + unsigned long *flags) { struct sighand_struct *sighand; @@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk) * is gone, we keep current->exit_code unless clear_code. */ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) + __releases(¤t->sighand->siglock) + __acquires(¤t->sighand->siglock) { if (arch_ptrace_stop_needed(exit_code, info)) { /* diff --git a/kernel/smp.c b/kernel/smp.c index ed6aacfcb7e..12ed8b013e2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); * * Returns 0 on success, else a negative status code. */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, +int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { struct call_single_data d = { @@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single); * 3) any other online cpu in @mask */ int smp_call_function_any(const struct cpumask *mask, - void (*func)(void *info), void *info, int wait) + smp_call_func_t func, void *info, int wait) { unsigned int cpu; const struct cpumask *nodemask; @@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *), void *info, bool wait) + smp_call_func_t func, void *info, bool wait) { struct call_function_data *data; unsigned long flags; @@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ -int smp_call_function(void (*func)(void *), void *info, int wait) +int smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); diff --git a/kernel/softirq.c b/kernel/softirq.c index f02a9dfa19b..18f4be0d5fe 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -229,18 +229,20 @@ restart: do { if (pending & 1) { + unsigned int vec_nr = h - softirq_vec; int prev_count = preempt_count(); - kstat_incr_softirqs_this_cpu(h - softirq_vec); - trace_softirq_entry(h, softirq_vec); + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); h->action(h); - trace_softirq_exit(h, softirq_vec); + trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %s %p" + printk(KERN_ERR "huh, entered softirq %u %s %p" "with preempt_count %08x," - " exited with %08x?\n", h - softirq_vec, - softirq_to_name[h - softirq_vec], - h->action, prev_count, preempt_count()); + " exited with %08x?\n", vec_nr, + softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); preempt_count() = prev_count; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 090c28812ce..2df820b03be 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -262,7 +262,7 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN]; + char ksym_buf[KSYM_NAME_LEN] __maybe_unused; __set_current_state(TASK_RUNNING); @@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu); if (IS_ERR(p)) - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); kthread_bind(p, cpu); sched_set_stop_task(cpu, p); @@ -372,7 +372,7 @@ static int __init cpu_stop_init(void) /* start one for the boot cpu */ err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, bcpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); register_cpu_notifier(&cpu_stop_cpu_notifier); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3a45c224770..5abfa151855 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,8 +161,6 @@ extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif -extern struct ratelimit_state printk_ratelimit_state; - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -704,6 +702,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &ten_thousand, }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "ngroups_max", @@ -1340,28 +1347,28 @@ static struct ctl_table fs_table[] = { .data = &inodes_stat, .maxlen = 2*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, .maxlen = 7*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_inodes, }, { .procname = "file-nr", .data = &files_stat, - .maxlen = 3*sizeof(int), + .maxlen = sizeof(files_stat), .mode = 0444, .proc_handler = proc_nr_files, }, { .procname = "file-max", .data = &files_stat.max_files, - .maxlen = sizeof(int), + .maxlen = 
sizeof(files_stat.max_files), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "nr_open", @@ -1377,7 +1384,7 @@ static struct ctl_table fs_table[] = { .data = &dentry_stat, .maxlen = 6*sizeof(int), .mode = 0444, - .proc_handler = proc_dointvec, + .proc_handler = proc_nr_dentry, }, { .procname = "overflowuid", diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 11281d5792b..3308fd7f1b5 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb, up_write(&listeners->sem); } -static int fill_pid(pid_t pid, struct task_struct *tsk, - struct taskstats *stats) +static void fill_stats(struct task_struct *tsk, struct taskstats *stats) { - int rc = 0; - - if (!tsk) { - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return -ESRCH; - } else - get_task_struct(tsk); - memset(stats, 0, sizeof(*stats)); /* * Each accounting subsystem adds calls to its functions to @@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); +} - /* Define err: label here if needed */ - put_task_struct(tsk); - return rc; +static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) +{ + struct task_struct *tsk; + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return -ESRCH; + fill_stats(tsk, stats); + put_task_struct(tsk); + return 0; } -static int fill_tgid(pid_t tgid, struct task_struct *first, - struct taskstats *stats) +static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) { - struct task_struct *tsk; + struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; @@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, * leaders who are already counted with the dead tasks */ rcu_read_lock(); - if (!first) - first = find_task_by_vpid(tgid); + first = find_task_by_vpid(tgid); if (!first || !lock_task_sighand(first, &flags)) goto out; @@ -268,7 +263,6 @@ out: return rc; } - static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; @@ -355,6 +349,10 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } +#ifdef CONFIG_IA64 +#define TASKSTATS_NEEDS_PADDING 1 +#endif + static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { struct nlattr *na, *ret; @@ -364,9 +362,33 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; + /* + * The taskstats structure is internally aligned on 8 byte + * boundaries but the layout of the aggregrate reply, with + * two NLA headers and the pid (each 4 bytes), actually + * force the entire structure to be unaligned. This causes + * the kernel to issue unaligned access warnings on some + * architectures like ia64. Unfortunately, some software out there + * doesn't properly unroll the NLA packet and assumes that the start + * of the taskstats structure will always be 20 bytes from the start + * of the netlink payload. Aligning the start of the taskstats + * structure breaks this software, which we don't want. So, for now + * the alignment only happens on architectures that require it + * and those users will have to update to fixed versions of those + * packages. Space is reserved in the packet only when needed. 
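The alignment reasoning in this comment (it continues in the next hunk) comes down to offset arithmetic: assuming the usual 4-byte genetlink header and 4-byte netlink attribute headers, struct taskstats would start 20 bytes into the message payload, which is not a multiple of 8, and prepending one empty TASKSTATS_TYPE_NULL attribute moves it to 24. A quick check of that arithmetic (header sizes assumed here, not pulled from the uapi headers):

#include <stdio.h>

/* Sizes assumed for the aggregate reply: genlmsghdr and struct nlattr are
 * both 4 bytes, and the pid/tgid attribute payload is a u32. */
enum {
        GENL_HDR  = 4,          /* genlmsghdr at the start of the payload */
        NLA_HDR   = 4,          /* struct nlattr: u16 len + u16 type */
        PID_FIELD = 4,          /* u32 pid or tgid */
};

int main(void)
{
        int off = GENL_HDR              /* genetlink header */
                + NLA_HDR               /* TASKSTATS_TYPE_AGGR_* nest */
                + NLA_HDR + PID_FIELD   /* TASKSTATS_TYPE_PID/TGID */
                + NLA_HDR;              /* TASKSTATS_TYPE_STATS */

        printf("without padding: struct taskstats at offset %d (mod 8 = %d)\n",
               off, off % 8);

        off += NLA_HDR;                 /* empty TASKSTATS_TYPE_NULL up front */
        printf("with padding:    struct taskstats at offset %d (mod 8 = %d)\n",
               off, off % 8);
        return 0;
}

That is why mk_reply() emits the zero-length attribute before nesting the aggregate, and why taskstats_packet_size() reserves an extra nla_total_size(0) when TASKSTATS_NEEDS_PADDING is defined.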
+ * This ifdef should be removed in several years e.g. 2012 once + * we can be confident that fixed versions are installed on most + * systems. We add the padding before the aggregate since the + * aggregate is already a defined type. + */ +#ifdef TASKSTATS_NEEDS_PADDING + if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) + goto err; +#endif na = nla_nest_start(skb, aggr); if (!na) goto err; + if (nla_put(skb, type, sizeof(pid), &pid) < 0) goto err; ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); @@ -424,74 +446,122 @@ err: return rc; } -static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +static int cmd_attr_register_cpumask(struct genl_info *info) { - int rc; - struct sk_buff *rep_skb; - struct taskstats *stats; - size_t size; cpumask_var_t mask; + int rc; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; - rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) - goto free_return_rc; - if (rc == 0) { - rc = add_del_listener(info->snd_pid, mask, REGISTER); - goto free_return_rc; - } + goto out; + rc = add_del_listener(info->snd_pid, mask, REGISTER); +out: + free_cpumask_var(mask); + return rc; +} + +static int cmd_attr_deregister_cpumask(struct genl_info *info) +{ + cpumask_var_t mask; + int rc; + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) - goto free_return_rc; - if (rc == 0) { - rc = add_del_listener(info->snd_pid, mask, DEREGISTER); -free_return_rc: - free_cpumask_var(mask); - return rc; - } + goto out; + rc = add_del_listener(info->snd_pid, mask, DEREGISTER); +out: free_cpumask_var(mask); + return rc; +} + +static size_t taskstats_packet_size(void) +{ + size_t size; - /* - * Size includes space for nested attributes - */ size = nla_total_size(sizeof(u32)) + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); +#ifdef TASKSTATS_NEEDS_PADDING + size += nla_total_size(0); /* Padding for alignment */ +#endif + return size; +} + +static int cmd_attr_pid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 pid; + int rc; + + size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) return rc; rc = -EINVAL; - if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { - u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); - if (!stats) - goto err; - - rc = fill_pid(pid, NULL, stats); - if (rc < 0) - goto err; - } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { - u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); - stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); - if (!stats) - goto err; - - rc = fill_tgid(tgid, NULL, stats); - if (rc < 0) - goto err; - } else + pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); + if (!stats) goto err; + rc = fill_stats_for_pid(pid, stats); + if (rc < 0) + goto err; return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; } +static int cmd_attr_tgid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 tgid; + int rc; + + size = taskstats_packet_size(); + + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); + if (rc < 0) + return rc; + + rc = -EINVAL; + tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); + if (!stats) + goto 
err; + + rc = fill_stats_for_tgid(tgid, stats); + if (rc < 0) + goto err; + return send_reply(rep_skb, info); +err: + nlmsg_free(rep_skb); + return rc; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) + return cmd_attr_register_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) + return cmd_attr_deregister_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) + return cmd_attr_pid(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) + return cmd_attr_tgid(info); + else + return -EINVAL; +} + static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; @@ -532,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) /* * Size includes space for nested attributes */ - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); is_thread_group = !!taskstats_tgid_alloc(tsk); if (is_thread_group) { @@ -555,9 +624,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!stats) goto err; - rc = fill_pid(-1, tsk, stats); - if (rc < 0) - goto err; + fill_stats(tsk, stats); /* * Doesn't matter if tsk is the leader or the last group member leaving diff --git a/kernel/timer.c b/kernel/timer.c index 68a9ae7679b..353b9227c2e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now) struct tvec_base *base = __get_cpu_var(tvec_bases); unsigned long expires; + /* + * Pretend that there is no timer pending if the cpu is offline. + * Possible pending timers will be migrated later to an active cpu. + */ + if (cpu_is_offline(smp_processor_id())) + return now + NEXT_TIMER_MAX_DELTA; spin_lock(&base->lock); if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); @@ -1319,7 +1325,7 @@ void do_timer(unsigned long ticks) { jiffies_64 += ticks; update_wall_time(); - calc_global_load(); + calc_global_load(ticks); } #ifdef __ARCH_WANT_SYS_ALARM diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e04b8bcdef8..ea37e2ff416 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -126,7 +126,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if (!ARM_UNWIND) + select FRAME_POINTER if !ARM_UNWIND && !S390 select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc251ed6672..7b8ec028154 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -168,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; -#define BLK_TC_HARDBARRIER BLK_TC_BARRIER #define BLK_TC_RAHEAD BLK_TC_AHEAD /* The ilog2() calls fall out because they're constant */ @@ -196,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, HARDBARRIER); what |= MASK_TC_BIT(rw, SYNC); what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); @@ -1807,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_HARDBARRIER) - rwbs[i++] = 'B'; if (rw & REQ_SYNC) rwbs[i++] = 'S'; if (rw & REQ_META) diff --git 
a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c3dab054d18..bd1c35a4fbc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -224,6 +224,9 @@ enum { RB_LEN_TIME_STAMP = 16, }; +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + static inline int rb_null_event(struct ring_buffer_event *event) { return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; @@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event) return length + RB_EVNT_HDR_SIZE; } -/* inline for ring buffer fast paths */ -static unsigned +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. + */ +static inline unsigned rb_event_length(struct ring_buffer_event *event) { switch (event->type_len) { @@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event) return 0; } +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + /** * ring_buffer_event_length - return the length of the event * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - unsigned length = rb_event_length(event); + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; @@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ if (event->type_len) @@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) -/* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) - int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) iter->head = 0; } +/* Slow path, do not inline */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? 
*/ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + /** * ring_buffer_update_event - update event type and data * @event: the event to update @@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) * data field. */ static void -rb_update_event(struct ring_buffer_event *event, - unsigned type, unsigned length) +rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event, unsigned length, + int add_timestamp, u64 delta) { - event->type_len = type; - - switch (type) { - - case RINGBUF_TYPE_PADDING: - case RINGBUF_TYPE_TIME_EXTEND: - case RINGBUF_TYPE_TIME_STAMP: - break; + /* Only a commit updates the timestamp */ + if (unlikely(!rb_event_is_commit(cpu_buffer, event))) + delta = 0; - case 0: - length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) - event->array[0] = length; - else - event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); - break; - default: - BUG(); + /* + * If we need to add a timestamp, then we + * add it to the start of the reserved space. + */ + if (unlikely(add_timestamp)) { + event = rb_add_time_stamp(event, delta); + length -= RB_LEN_TIME_EXTEND; + delta = 0; } + + event->time_delta = delta; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { + event->type_len = 0; + event->array[0] = length; + } else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); } /* @@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } -static struct ring_buffer_event * +/* + * This is the slow path, force gcc not to inline it. + */ +static noinline struct ring_buffer_event * rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length, unsigned long tail, - struct buffer_page *tail_page, u64 *ts) + struct buffer_page *tail_page, u64 ts) { struct buffer_page *commit_page = cpu_buffer->commit_page; struct ring_buffer *buffer = cpu_buffer->buffer; @@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer); - next_page->page->time_stamp = *ts; + ts = rb_time_stamp(buffer); + next_page->page->time_stamp = ts; } out_again: @@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) + unsigned long length, u64 ts, + u64 delta, int add_timestamp) { struct buffer_page *tail_page; struct ring_buffer_event *event; unsigned long tail, write; + /* + * If the time delta since the last event is too big to + * hold in the time field of the event, then we append a + * TIME EXTEND event ahead of the data event. 
+ */ + if (unlikely(add_timestamp)) + length += RB_LEN_TIME_EXTEND; + tail_page = cpu_buffer->tail_page; write = local_add_return(length, &tail_page->write); @@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) + if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, tail_page, ts); @@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); - rb_update_event(event, type, length); + rb_update_event(cpu_buffer, event, length, add_timestamp, delta); - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + local_inc(&tail_page->entries); /* * If this is the first commit on the page, then update * its timestamp. */ if (!tail) - tail_page->page->time_stamp = *ts; + tail_page->page->time_stamp = ts; return event; } @@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long addr; new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); + old_index = new_index + rb_event_ts_length(event); addr = (unsigned long)event; addr &= PAGE_MASK; @@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, return 0; } -static int -rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, - u64 *ts, u64 *delta) -{ - struct ring_buffer_event *event; - int ret; - - WARN_ONCE(*delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", - (unsigned long long)*delta, - (unsigned long long)*ts, - (unsigned long long)cpu_buffer->write_stamp); - - /* - * The delta is too big, we to add a - * new timestamp. - */ - event = __rb_reserve_next(cpu_buffer, - RINGBUF_TYPE_TIME_EXTEND, - RB_LEN_TIME_EXTEND, - ts); - if (!event) - return -EBUSY; - - if (PTR_ERR(event) == -EAGAIN) - return -EAGAIN; - - /* Only a commited time event can update the write stamp */ - if (rb_event_is_commit(cpu_buffer, event)) { - /* - * If this is the first on the page, then it was - * updated with the page itself. Try to discard it - * and if we can't just make it zero. 
- */ - if (rb_event_index(event)) { - event->time_delta = *delta & TS_MASK; - event->array[0] = *delta >> TS_SHIFT; - } else { - /* try to discard, since we do not need this */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* nope, just zero it */ - event->time_delta = 0; - event->array[0] = 0; - } - } - cpu_buffer->write_stamp = *ts; - /* let the caller know this was the commit */ - ret = 1; - } else { - /* Try to discard the event */ - if (!rb_try_to_discard(cpu_buffer, event)) { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - } - ret = 0; - } - - *delta = 0; - - return ret; -} - static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) { local_inc(&cpu_buffer->committing); local_inc(&cpu_buffer->commits); } -static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) { unsigned long commits; @@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta = 0; - int commit = 0; + u64 ts, delta; int nr_loops = 0; + int add_timestamp; + u64 diff; rb_start_commit(cpu_buffer); @@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, length = rb_calculate_event_length(length); again: + add_timestamp = 0; + delta = 0; + /* * We allow for interrupts to reenter here and do a trace. * If one does, it will cause this original code to loop @@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer, goto out_fail; ts = rb_time_stamp(cpu_buffer->buffer); + diff = ts - cpu_buffer->write_stamp; - /* - * Only the first commit can update the timestamp. - * Yes there is a race here. If an interrupt comes in - * just after the conditional and it traces too, then it - * will also check the deltas. More than one timestamp may - * also be made. But only the entry that did the actual - * commit will be something other than zero. - */ - if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer))) { - u64 diff; - - diff = ts - cpu_buffer->write_stamp; - - /* make sure this diff is calculated here */ - barrier(); - - /* Did the write stamp get updated already? */ - if (unlikely(ts < cpu_buffer->write_stamp)) - goto get_event; + /* make sure this diff is calculated here */ + barrier(); + /* Did the write stamp get updated already? */ + if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { - - commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) - goto out_fail; - - if (commit == -EAGAIN) - goto again; - - RB_WARN_ON(cpu_buffer, commit < 0); + WARN_ONCE(delta > (1ULL << 59), + KERN_WARNING "Delta way too big! 
%llu ts=%llu write stamp = %llu\n", + (unsigned long long)delta, + (unsigned long long)ts, + (unsigned long long)cpu_buffer->write_stamp); + add_timestamp = 1; } } - get_event: - event = __rb_reserve_next(cpu_buffer, 0, length, &ts); + event = __rb_reserve_next(cpu_buffer, length, ts, + delta, add_timestamp); if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) goto out_fail; - if (!rb_event_is_commit(cpu_buffer, event)) - delta = 0; - - event->time_delta = delta; - return event; out_fail: @@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer, #define TRACE_RECURSIVE_DEPTH 16 -static int trace_recursive_lock(void) +/* Keep this code out of the fast path cache */ +static noinline void trace_recursive_fail(void) { - current->trace_recursion++; - - if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) - return 0; - /* Disable all tracing before we do anything else */ tracing_off_permanent(); @@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void) in_nmi()); WARN_ON_ONCE(1); +} + +static inline int trace_recursive_lock(void) +{ + current->trace_recursion++; + + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; + + trace_recursive_fail(); + return -1; } -static void trace_recursive_unlock(void) +static inline void trace_recursive_unlock(void) { WARN_ON_ONCE(!current->trace_recursion); @@ -2308,12 +2298,28 @@ static void rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { + u64 delta; + /* * The event first in the commit queue updates the * time stamp. */ - if (rb_event_is_commit(cpu_buffer, event)) - cpu_buffer->write_stamp += event->time_delta; + if (rb_event_is_commit(cpu_buffer, event)) { + /* + * A commit event that is first on a page + * updates the write timestamp with the page stamp + */ + if (!rb_event_index(event)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->write_stamp += delta; + } else + cpu_buffer->write_stamp += event->time_delta; + } } static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, @@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + /* array[0] holds the actual length for the discarded event */ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; event->type_len = RINGBUF_TYPE_PADDING; @@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written, or from discarded - * commits. The most that we can have is the number on a single page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) return NULL; /* - * We repeat when a timestamp is encountered. 
- * We can get multiple timestamps by nested interrupts or also - * if filtering is on (discarding commits). Since discarding - * commits can be frequent we can get a lot of timestamps. - * But we limit them by not adding timestamps if they begin - * at the start of a page. + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) @@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (len > (commit - read)) len = (commit - read); - size = rb_event_length(event); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); if (len < size) goto out_unlock; @@ -3845,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* Need to copy one event at a time */ do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); memcpy(bpage->data + pos, rpage->data + rpos, size); len -= size; @@ -3857,8 +3872,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, break; event = rb_reader_event(cpu_buffer); - size = rb_event_length(event); - } while (len > size); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + } while (len >= size); /* update bpage */ local_set(&bpage->commit, pos); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 001bcd2ccf4..f8cf959bad4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,7 +17,6 @@ #include <linux/writeback.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include <linux/notifier.h> #include <linux/irqflags.h> #include <linux/debugfs.h> @@ -1284,6 +1283,8 @@ void trace_dump_stack(void) __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); } +static DEFINE_PER_CPU(int, user_stack_count); + void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { @@ -1302,6 +1303,18 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (unlikely(in_nmi())) return; + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. 
+ */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) @@ -1319,6 +1332,11 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) save_stack_trace_user(&trace); if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + + __this_cpu_dec(user_stack_count); + + out: + preempt_enable(); } #ifdef UNUSED @@ -2320,11 +2338,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf, return count; } +static loff_t tracing_seek(struct file *file, loff_t offset, int origin) +{ + if (file->f_mode & FMODE_READ) + return seq_lseek(file, offset, origin); + else + return 0; +} + static const struct file_operations tracing_fops = { .open = tracing_open, .read = seq_read, .write = tracing_write_stub, - .llseek = seq_lseek, + .llseek = tracing_seek, .release = tracing_release, }; @@ -3996,13 +4022,9 @@ static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); struct dentry *d_cpu; - /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ - char cpu_dir[7]; - - if (cpu > 999 || cpu < 0) - return; + char cpu_dir[30]; /* 30 characters should be more than enough */ - sprintf(cpu_dir, "cpu%ld", cpu); + snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = debugfs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b8d2852baa4..2dec9bcde8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,6 @@ #include <linux/perf_event.h> #include <linux/stringify.h> #include <linux/limits.h> -#include <linux/uaccess.h> #include <asm/bitsperlong.h> #include "trace.h" diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 0a67e041edf..24dc60d9fa1 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) stats->ac_ppid = pid_alive(tsk) ? rcu_dereference(tsk->real_parent)->tgid : 0; rcu_read_unlock(); - stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC; - stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC; - stats->ac_utimescaled = - cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC; - stats->ac_stimescaled = - cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC; + stats->ac_utime = cputime_to_usecs(tsk->utime); + stats->ac_stime = cputime_to_usecs(tsk->stime); + stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); + stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); stats->ac_minflt = tsk->min_flt; stats->ac_majflt = tsk->maj_flt; diff --git a/kernel/user.c b/kernel/user.c index 7e72614b736..5c598ca781d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) * upon function exit. 
*/ static void free_user(struct user_struct *up, unsigned long flags) + __releases(&uidhash_lock) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); @@ -157,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { + put_user_ns(ns); key_put(new->uid_keyring); key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); diff --git a/kernel/wait.c b/kernel/wait.c index c4bd3d825f3..b0310eb6cc1 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) } EXPORT_SYMBOL(prepare_to_wait_exclusive); -/* +/** * finish_wait - clean up after waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor @@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) } EXPORT_SYMBOL(finish_wait); -/* +/** * abort_exclusive_wait - abort exclusive waiting in a queue * @q: waitqueue waited on * @wait: wait descriptor - * @state: runstate of the waiter to be woken + * @mode: runstate of the waiter to be woken * @key: key to identify a wait bit queue or %NULL * * Sets current thread back to running state and removes diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d..5b082156cd2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __initdata no_watchdog; +static int no_watchdog; /* boot commands */ @@ -364,7 +364,8 @@ static int watchdog_nmi_enable(int cpu) goto out_save; } - printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e5ff2cbaadc..e785b0f2aea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -661,7 +661,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) { struct worker *worker = kthread_data(task); - if (likely(!(worker->flags & WORKER_NOT_RUNNING))) + if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(get_gcwq_nr_running(cpu)); } @@ -687,7 +687,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, struct global_cwq *gcwq = get_gcwq(cpu); atomic_t *nr_running = get_gcwq_nr_running(cpu); - if (unlikely(worker->flags & WORKER_NOT_RUNNING)) + if (worker->flags & WORKER_NOT_RUNNING) return NULL; /* this can only happen on the local cpu */ @@ -2064,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, * checks and call back into the fixup functions where we * might deadlock. */ - INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); + INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); @@ -3692,7 +3692,8 @@ static int __init init_workqueues(void) system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); - BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || + !system_unbound_wq); return 0; } early_initcall(init_workqueues);
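
The comment added to mk_reply() in the taskstats hunk above is really advice to userspace: a taskstats consumer should walk the netlink attributes of the reply instead of assuming the taskstats payload always starts 20 bytes into the netlink message. Below is a minimal, hypothetical userspace sketch of such a parser. It is not part of this patch; parse_taskstats_reply() and its error handling are invented for illustration, and it assumes the caller has already received a complete TASKSTATS_CMD_NEW reply into nlh.

/*
 * Hypothetical userspace helper, not part of the patch above: walk the
 * attributes of a taskstats reply rather than hard-coding offsets.
 */
#include <string.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

/* Returns 0 and fills *stats if a TASKSTATS_TYPE_STATS attribute is found. */
static int parse_taskstats_reply(struct nlmsghdr *nlh, struct taskstats *stats)
{
	/* Attributes start after the netlink and generic netlink headers. */
	struct nlattr *na = (struct nlattr *)((char *)NLMSG_DATA(nlh) + GENL_HDRLEN);
	int rem = NLMSG_PAYLOAD(nlh, GENL_HDRLEN);

	while (rem >= NLA_HDRLEN && na->nla_len >= NLA_HDRLEN && na->nla_len <= rem) {
		if (na->nla_type == TASKSTATS_TYPE_AGGR_PID ||
		    na->nla_type == TASKSTATS_TYPE_AGGR_TGID) {
			/* Nested attributes: the pid/tgid first, then the stats. */
			struct nlattr *nested = (struct nlattr *)((char *)na + NLA_HDRLEN);
			int nrem = na->nla_len - NLA_HDRLEN;

			while (nrem >= NLA_HDRLEN && nested->nla_len >= NLA_HDRLEN &&
			       nested->nla_len <= nrem) {
				if (nested->nla_type == TASKSTATS_TYPE_STATS &&
				    nested->nla_len >= NLA_HDRLEN + sizeof(*stats)) {
					/* memcpy in case the payload is unaligned. */
					memcpy(stats, (char *)nested + NLA_HDRLEN, sizeof(*stats));
					return 0;
				}
				nrem -= NLA_ALIGN(nested->nla_len);
				nested = (struct nlattr *)((char *)nested + NLA_ALIGN(nested->nla_len));
			}
		}
		/* TASKSTATS_TYPE_NULL padding and unknown attributes are skipped here. */
		rem -= NLA_ALIGN(na->nla_len);
		na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len));
	}
	return -1;
}

A parser structured this way skips the optional TASKSTATS_TYPE_NULL padding attribute transparently, so the same code handles both the padded (ia64, TASKSTATS_NEEDS_PADDING) and unpadded reply layouts.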