summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-12-18 21:54:49 +0100
committerIngo Molnar <mingo@elte.hu>2008-12-18 21:54:49 +0100
commitd110ec3a1e1f522e2e9dfceb9c36d6590c26d2d4 (patch)
tree86b2f8f1d22b74b05239525c55bd42e3db6afc03 /kernel
parent343e9099c8152daff20e10d6269edec21da44fc0 (diff)
parent55dac3a5553b13891f0ae4bbd11920619b5436d4 (diff)
Merge branch 'linus' into core/rcu
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/audit.c32
-rw-r--r--kernel/audit_tree.c91
-rw-r--r--kernel/auditfilter.c14
-rw-r--r--kernel/auditsc.c24
-rw-r--r--kernel/cgroup.c24
-rw-r--r--kernel/cgroup_freezer.c19
-rw-r--r--kernel/cpu.c5
-rw-r--r--kernel/cpuset.c33
-rw-r--r--kernel/exit.c14
-rw-r--r--kernel/fork.c27
-rw-r--r--kernel/hrtimer.c26
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c68
-rw-r--r--kernel/irq/migration.c11
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/kallsyms.c17
-rw-r--r--kernel/kprobes.c23
-rw-r--r--kernel/latencytop.c2
-rw-r--r--kernel/lockdep.c4
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/posix-cpu-timers.c9
-rw-r--r--kernel/power/main.c2
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/profile.c6
-rw-r--r--kernel/ptrace.c4
-rw-r--r--kernel/relay.c16
-rw-r--r--kernel/sched.c48
-rw-r--r--kernel/sched_clock.c6
-rw-r--r--kernel/sched_debug.c46
-rw-r--r--kernel/sched_fair.c83
-rw-r--r--kernel/sched_features.h1
-rw-r--r--kernel/sched_stats.h15
-rw-r--r--kernel/smp.c18
-rw-r--r--kernel/softirq.c7
-rw-r--r--kernel/softlockup.c2
-rw-r--r--kernel/stop_machine.c5
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c10
-rw-r--r--kernel/time/tick-sched.c4
-rw-r--r--kernel/time/timekeeping.c22
-rw-r--r--kernel/timer.c129
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/ftrace.c147
-rw-r--r--kernel/trace/ring_buffer.c176
-rw-r--r--kernel/trace/trace.c61
-rw-r--r--kernel/trace/trace_mmiotrace.c16
-rw-r--r--kernel/trace/trace_stack.c24
-rw-r--r--kernel/workqueue.c45
49 files changed, 955 insertions, 396 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 9a3ec66a9d8..19fad003b19 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o
-CFLAGS_REMOVE_sched.o = -mno-spe
-
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
CFLAGS_REMOVE_lockdep.o = -pg
@@ -21,7 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -mno-spe -pg
+CFLAGS_REMOVE_sched.o = -pg
endif
obj-$(CONFIG_FREEZER) += freezer.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 4414e93d875..ce6d8ea3131 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -61,8 +61,11 @@
#include "audit.h"
-/* No auditing will take place until audit_initialized != 0.
+/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
* (Initialization happens after skb_init is called.) */
+#define AUDIT_DISABLED -1
+#define AUDIT_UNINITIALIZED 0
+#define AUDIT_INITIALIZED 1
static int audit_initialized;
#define AUDIT_OFF 0
@@ -965,6 +968,9 @@ static int __init audit_init(void)
{
int i;
+ if (audit_initialized == AUDIT_DISABLED)
+ return 0;
+
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
@@ -976,7 +982,7 @@ static int __init audit_init(void)
skb_queue_head_init(&audit_skb_queue);
skb_queue_head_init(&audit_skb_hold_queue);
- audit_initialized = 1;
+ audit_initialized = AUDIT_INITIALIZED;
audit_enabled = audit_default;
audit_ever_enabled |= !!audit_default;
@@ -999,13 +1005,21 @@ __initcall(audit_init);
static int __init audit_enable(char *str)
{
audit_default = !!simple_strtol(str, NULL, 0);
- printk(KERN_INFO "audit: %s%s\n",
- audit_default ? "enabled" : "disabled",
- audit_initialized ? "" : " (after initialization)");
- if (audit_initialized) {
+ if (!audit_default)
+ audit_initialized = AUDIT_DISABLED;
+
+ printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+
+ if (audit_initialized == AUDIT_INITIALIZED) {
audit_enabled = audit_default;
audit_ever_enabled |= !!audit_default;
+ } else if (audit_initialized == AUDIT_UNINITIALIZED) {
+ printk(" (after initialization)");
+ } else {
+ printk(" (until reboot)");
}
+ printk("\n");
+
return 1;
}
@@ -1107,9 +1121,7 @@ unsigned int audit_serial(void)
static inline void audit_get_stamp(struct audit_context *ctx,
struct timespec *t, unsigned int *serial)
{
- if (ctx)
- auditsc_get_stamp(ctx, t, serial);
- else {
+ if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
*t = CURRENT_TIME;
*serial = audit_serial();
}
@@ -1146,7 +1158,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
int reserve;
unsigned long timeout_start = jiffies;
- if (!audit_initialized)
+ if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
if (unlikely(audit_filter_type(type)))
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ba0e0d934f..8b509441f49 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -24,6 +24,7 @@ struct audit_chunk {
struct list_head trees; /* with root here */
int dead;
int count;
+ atomic_long_t refs;
struct rcu_head head;
struct node {
struct list_head list;
@@ -56,7 +57,8 @@ static LIST_HEAD(prune_list);
* tree is refcounted; one reference for "some rules on rules_list refer to
* it", one for each chunk with pointer to it.
*
- * chunk is refcounted by embedded inotify_watch.
+ * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount
+ * of watch contributes 1 to .refs).
*
* node.index allows to get from node.list to containing chunk.
* MSB of that sucker is stolen to mark taggings that we might have to
@@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count)
INIT_LIST_HEAD(&chunk->hash);
INIT_LIST_HEAD(&chunk->trees);
chunk->count = count;
+ atomic_long_set(&chunk->refs, 1);
for (i = 0; i < count; i++) {
INIT_LIST_HEAD(&chunk->owners[i].list);
chunk->owners[i].index = i;
@@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count)
return chunk;
}
-static void __free_chunk(struct rcu_head *rcu)
+static void free_chunk(struct audit_chunk *chunk)
{
- struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
int i;
for (i = 0; i < chunk->count; i++) {
@@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu)
kfree(chunk);
}
-static inline void free_chunk(struct audit_chunk *chunk)
+void audit_put_chunk(struct audit_chunk *chunk)
{
- call_rcu(&chunk->head, __free_chunk);
+ if (atomic_long_dec_and_test(&chunk->refs))
+ free_chunk(chunk);
}
-void audit_put_chunk(struct audit_chunk *chunk)
+static void __put_chunk(struct rcu_head *rcu)
{
- put_inotify_watch(&chunk->watch);
+ struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head);
+ audit_put_chunk(chunk);
}
enum {HASH_SIZE = 128};
@@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
list_for_each_entry_rcu(p, list, hash) {
if (p->watch.inode == inode) {
- get_inotify_watch(&p->watch);
+ atomic_long_inc(&p->refs);
return p;
}
}
@@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
/* tagging and untagging inodes with trees */
-static void untag_chunk(struct audit_chunk *chunk, struct node *p)
+static struct audit_chunk *find_chunk(struct node *p)
+{
+ int index = p->index & ~(1U<<31);
+ p -= index;
+ return container_of(p, struct audit_chunk, owners[0]);
+}
+
+static void untag_chunk(struct node *p)
{
+ struct audit_chunk *chunk = find_chunk(p);
struct audit_chunk *new;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
+ if (!pin_inotify_watch(&chunk->watch)) {
+ /*
+ * Filesystem is shutting down; all watches are getting
+ * evicted, just take it off the node list for this
+ * tree and let the eviction logics take care of the
+ * rest.
+ */
+ owner = p->owner;
+ if (owner->root == chunk) {
+ list_del_init(&owner->same_root);
+ owner->root = NULL;
+ }
+ list_del_init(&p->list);
+ p->owner = NULL;
+ put_tree(owner);
+ return;
+ }
+
+ spin_unlock(&hash_lock);
+
+ /*
+ * pin_inotify_watch() succeeded, so the watch won't go away
+ * from under us.
+ */
mutex_lock(&chunk->watch.inode->inotify_mutex);
if (chunk->dead) {
mutex_unlock(&chunk->watch.inode->inotify_mutex);
- return;
+ goto out;
}
owner = p->owner;
@@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
inotify_evict_watch(&chunk->watch);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
put_inotify_watch(&chunk->watch);
- return;
+ goto out;
}
new = alloc_chunk(size);
@@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p)
inotify_evict_watch(&chunk->watch);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
put_inotify_watch(&chunk->watch);
- return;
+ goto out;
Fallback:
// do the best we can
@@ -277,6 +313,9 @@ Fallback:
put_tree(owner);
spin_unlock(&hash_lock);
mutex_unlock(&chunk->watch.inode->inotify_mutex);
+out:
+ unpin_inotify_watch(&chunk->watch);
+ spin_lock(&hash_lock);
}
static int create_chunk(struct inode *inode, struct audit_tree *tree)
@@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
return 0;
}
-static struct audit_chunk *find_chunk(struct node *p)
-{
- int index = p->index & ~(1U<<31);
- p -= index;
- return container_of(p, struct audit_chunk, owners[0]);
-}
-
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
@@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim)
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
- struct audit_chunk *chunk;
p = list_entry(victim->chunks.next, struct node, list);
- chunk = find_chunk(p);
- get_inotify_watch(&chunk->watch);
- spin_unlock(&hash_lock);
-
- untag_chunk(chunk, p);
- put_inotify_watch(&chunk->watch);
- spin_lock(&hash_lock);
+ untag_chunk(p);
}
spin_unlock(&hash_lock);
put_tree(victim);
@@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree)
while (!list_empty(&tree->chunks)) {
struct node *node;
- struct audit_chunk *chunk;
node = list_entry(tree->chunks.next, struct node, list);
@@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree)
if (!(node->index & (1U<<31)))
break;
- chunk = find_chunk(node);
- get_inotify_watch(&chunk->watch);
- spin_unlock(&hash_lock);
-
- untag_chunk(chunk, node);
-
- put_inotify_watch(&chunk->watch);
- spin_lock(&hash_lock);
+ untag_chunk(node);
}
if (!tree->root && !tree->goner) {
tree->goner = 1;
@@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
static void destroy_watch(struct inotify_watch *watch)
{
struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch);
- free_chunk(chunk);
+ call_rcu(&chunk->head, __put_chunk);
}
static const struct inotify_operations rtree_inotify_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b7d354e2b0e..9fd85a4640a 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list)
list_for_each_entry_safe(p, n, in_list, ilist) {
list_del(&p->ilist);
inotify_rm_watch(audit_ih, &p->wdata);
- /* the put matching the get in audit_do_del_rule() */
- put_inotify_watch(&p->wdata);
+ /* the unpin matching the pin in audit_do_del_rule() */
+ unpin_inotify_watch(&p->wdata);
}
}
@@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry,
/* Put parent on the inotify un-registration
* list. Grab a reference before releasing
* audit_filter_mutex, to be released in
- * audit_inotify_unregister(). */
- list_add(&parent->ilist, &inotify_list);
- get_inotify_watch(&parent->wdata);
+ * audit_inotify_unregister().
+ * If filesystem is going away, just leave
+ * the sucker alone, eviction will take
+ * care of it.
+ */
+ if (pin_inotify_watch(&parent->wdata))
+ list_add(&parent->ilist, &inotify_list);
}
}
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index cf5bc2f5f9c..2a3f0afc4d2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1459,7 +1459,6 @@ void audit_free(struct task_struct *tsk)
/**
* audit_syscall_entry - fill in an audit record at syscall entry
- * @tsk: task being audited
* @arch: architecture type
* @major: major syscall type (function)
* @a1: additional syscall register 1
@@ -1548,9 +1547,25 @@ void audit_syscall_entry(int arch, int major,
context->ppid = 0;
}
+void audit_finish_fork(struct task_struct *child)
+{
+ struct audit_context *ctx = current->audit_context;
+ struct audit_context *p = child->audit_context;
+ if (!p || !ctx || !ctx->auditable)
+ return;
+ p->arch = ctx->arch;
+ p->major = ctx->major;
+ memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
+ p->ctime = ctx->ctime;
+ p->dummy = ctx->dummy;
+ p->auditable = ctx->auditable;
+ p->in_syscall = ctx->in_syscall;
+ p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
+ p->ppid = current->pid;
+}
+
/**
* audit_syscall_exit - deallocate audit context after a system call
- * @tsk: task being audited
* @valid: success/failure flag
* @return_code: syscall return value
*
@@ -1942,15 +1957,18 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
*
* Also sets the context as auditable.
*/
-void auditsc_get_stamp(struct audit_context *ctx,
+int auditsc_get_stamp(struct audit_context *ctx,
struct timespec *t, unsigned int *serial)
{
+ if (!ctx->in_syscall)
+ return 0;
if (!ctx->serial)
ctx->serial = audit_serial();
t->tv_sec = ctx->ctime.tv_sec;
t->tv_nsec = ctx->ctime.tv_nsec;
*serial = ctx->serial;
ctx->auditable = 1;
+ return 1;
}
/* global counter which is incremented every time something logs in */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 35eebd5510c..8185a0f0959 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -702,7 +702,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
* any child cgroups exist. This is theoretically supportable
* but involves complex error handling, so it's being left until
* later */
- if (!list_empty(&cgrp->children))
+ if (root->number_of_cgroups > 1)
return -EBUSY;
/* Process each subsystem */
@@ -2039,10 +2039,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
struct cgroup *cgrp;
struct cgroup_iter it;
struct task_struct *tsk;
+
/*
- * Validate dentry by checking the superblock operations
+ * Validate dentry by checking the superblock operations,
+ * and make sure it's a directory.
*/
- if (dentry->d_sb->s_op != &cgroup_ops)
+ if (dentry->d_sb->s_op != &cgroup_ops ||
+ !S_ISDIR(dentry->d_inode->i_mode))
goto err;
ret = 0;
@@ -2472,10 +2475,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
-
- parent = cgrp->parent;
- root = cgrp->root;
- sb = root->sb;
+ mutex_unlock(&cgroup_mutex);
/*
* Call pre_destroy handlers of subsys. Notify subsystems
@@ -2483,7 +2483,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
*/
cgroup_call_pre_destroy(cgrp);
- if (cgroup_has_css_refs(cgrp)) {
+ mutex_lock(&cgroup_mutex);
+ parent = cgrp->parent;
+ root = cgrp->root;
+ sb = root->sb;
+
+ if (atomic_read(&cgrp->count)
+ || !list_empty(&cgrp->children)
+ || cgroup_has_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
@@ -2497,7 +2504,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
list_del(&cgrp->sibling);
spin_lock(&cgrp->dentry->d_lock);
d = dget(cgrp->dentry);
- cgrp->dentry = NULL;
spin_unlock(&d->d_lock);
cgroup_d_remove_dir(d);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 7fa476f01d0..fb249e2bcad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -184,9 +184,20 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
- task_lock(task);
+ /*
+ * No lock is needed, since the task isn't on tasklist yet,
+ * so it can't be moved to another cgroup, which means the
+ * freezer won't be removed and will be valid during this
+ * function call.
+ */
freezer = task_freezer(task);
- task_unlock(task);
+
+ /*
+ * The root cgroup is non-freezable, so we can skip the
+ * following check.
+ */
+ if (!freezer->css.cgroup->parent)
+ return;
spin_lock_irq(&freezer->lock);
BUG_ON(freezer->state == CGROUP_FROZEN);
@@ -331,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup,
else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
goal_state = CGROUP_FROZEN;
else
- return -EIO;
+ return -EINVAL;
if (!cgroup_lock_live_group(cgroup))
return -ENODEV;
@@ -350,6 +361,8 @@ static struct cftype files[] = {
static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
{
+ if (!cgroup->parent)
+ return 0;
return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 86d49045dae..8ea32e8d68b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -462,7 +462,7 @@ out:
* It must be called by the arch code on the new cpu, before the new cpu
* enables interrupts and before the "boot" cpu returns from __cpu_up().
*/
-void notify_cpu_starting(unsigned int cpu)
+void __cpuinit notify_cpu_starting(unsigned int cpu)
{
unsigned long val = CPU_STARTING;
@@ -499,3 +499,6 @@ const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
+
+const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
+EXPORT_SYMBOL(cpu_all_bits);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e00526f52e..96c0ba13b8c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
+#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
@@ -584,10 +585,9 @@ static int generate_sched_domains(cpumask_t **domains,
int i, j, k; /* indices for partition finding loops */
cpumask_t *doms; /* resulting partition; i.e. sched domains */
struct sched_domain_attr *dattr; /* attributes for custom domains */
- int ndoms; /* number of sched domains in result */
+ int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] cpumask_t slot */
- ndoms = 0;
doms = NULL;
dattr = NULL;
csa = NULL;
@@ -674,10 +674,8 @@ restart:
* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
*/
doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
- if (!doms) {
- ndoms = 0;
+ if (!doms)
goto done;
- }
/*
* The rest of the code, including the scheduler, can deal with
@@ -732,6 +730,13 @@ restart:
done:
kfree(csa);
+ /*
+ * Fallback to the default domain if kmalloc() failed.
+ * See comments in partition_sched_domains().
+ */
+ if (doms == NULL)
+ ndoms = 1;
+
*domains = doms;
*attributes = dattr;
return ndoms;
@@ -2011,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
* Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
* See also the previous routine cpuset_track_online_cpus().
*/
-void cpuset_track_online_nodes(void)
+static int cpuset_track_online_nodes(struct notifier_block *self,
+ unsigned long action, void *arg)
{
cgroup_lock();
- top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
- scan_for_empty_cpusets(&top_cpuset);
+ switch (action) {
+ case MEM_ONLINE:
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ break;
+ case MEM_OFFLINE:
+ top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+ scan_for_empty_cpusets(&top_cpuset);
+ break;
+ default:
+ break;
+ }
cgroup_unlock();
+ return NOTIFY_OK;
}
#endif
@@ -2032,6 +2048,7 @@ void __init cpuset_init_smp(void)
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
hotcpu_notifier(cpuset_track_online_cpus, 0);
+ hotplug_memory_notifier(cpuset_track_online_nodes, 10);
}
/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 80137a5d946..2d8be7ebb0f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -40,7 +40,6 @@
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
-#include <linux/compat.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
@@ -141,6 +140,11 @@ static void __exit_signal(struct task_struct *tsk)
if (sig) {
flush_sigqueue(&sig->shared_pending);
taskstats_tgid_free(sig);
+ /*
+ * Make sure ->signal can't go away under rq->lock,
+ * see account_group_exec_runtime().
+ */
+ task_rq_unlock_wait(tsk);
__cleanup_signal(sig);
}
}
@@ -1054,14 +1058,6 @@ NORET_TYPE void do_exit(long code)
exit_itimers(tsk->signal);
}
acct_collect(code, group_dead);
-#ifdef CONFIG_FUTEX
- if (unlikely(tsk->robust_list))
- exit_robust_list(tsk);
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list))
- compat_exit_robust_list(tsk);
-#endif
-#endif
if (group_dead)
tty_audit_exit();
if (unlikely(tsk->audit_context))
diff --git a/kernel/fork.c b/kernel/fork.c
index f6083561dfe..495da2e9a8b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
#include <linux/jiffies.h>
#include <linux/tracehook.h>
#include <linux/futex.h>
+#include <linux/compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
@@ -314,17 +315,20 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
file = tmp->vm_file;
if (file) {
struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = file->f_mapping;
+
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
-
- /* insert tmp into the share list, just after mpnt */
- spin_lock(&file->f_mapping->i_mmap_lock);
+ spin_lock(&mapping->i_mmap_lock);
+ if (tmp->vm_flags & VM_SHARED)
+ mapping->i_mmap_writable++;
tmp->vm_truncate_count = mpnt->vm_truncate_count;
- flush_dcache_mmap_lock(file->f_mapping);
+ flush_dcache_mmap_lock(mapping);
+ /* insert tmp into the share list, just after mpnt */
vma_prio_tree_add(tmp, mpnt);
- flush_dcache_mmap_unlock(file->f_mapping);
- spin_unlock(&file->f_mapping->i_mmap_lock);
+ flush_dcache_mmap_unlock(mapping);
+ spin_unlock(&mapping->i_mmap_lock);
}
/*
@@ -519,6 +523,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
struct completion *vfork_done = tsk->vfork_done;
+ /* Get rid of any futexes when releasing the mm */
+#ifdef CONFIG_FUTEX
+ if (unlikely(tsk->robust_list))
+ exit_robust_list(tsk);
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list))
+ compat_exit_robust_list(tsk);
+#endif
+#endif
+
/* Get rid of any cached register state */
deactivate_mm(tsk, mm);
@@ -1387,6 +1401,7 @@ long do_fork(unsigned long clone_flags,
init_completion(&vfork);
}
+ audit_finish_fork(p);
tracehook_report_clone(trace, regs, clone_flags, nr, p);
/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b465dfde42..47e63349d1b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
/* Timer is expired, act upon the callback mode */
switch(timer->cb_mode) {
- case HRTIMER_CB_IRQSAFE_NO_RESTART:
- debug_hrtimer_deactivate(timer);
- /*
- * We can call the callback from here. No restart
- * happens, so no danger of recursion
- */
- BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
- return 1;
case HRTIMER_CB_IRQSAFE_PERCPU:
case HRTIMER_CB_IRQSAFE_UNLOCKED:
/*
@@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
*/
debug_hrtimer_deactivate(timer);
return 1;
- case HRTIMER_CB_IRQSAFE:
case HRTIMER_CB_SOFTIRQ:
/*
* Move everything else into the softirq pending list !
@@ -1209,6 +1200,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
int restart;
+ int emulate_hardirq_ctx = 0;
timer = list_entry(cpu_base->cb_pending.next,
struct hrtimer, cb_entry);
@@ -1217,10 +1209,24 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
timer_stats_account_hrtimer(timer);
fn = timer->function;
+ /*
+ * A timer might have been added to the cb_pending list
+ * when it was migrated during a cpu-offline operation.
+ * Emulate hardirq context for such timers.
+ */
+ if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+ timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
+ emulate_hardirq_ctx = 1;
+
__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
spin_unlock_irq(&cpu_base->lock);
- restart = fn(timer);
+ if (unlikely(emulate_hardirq_ctx)) {
+ local_irq_disable();
+ restart = fn(timer);
+ local_irq_enable();
+ } else
+ restart = fn(timer);
spin_lock_irq(&cpu_base->lock);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c9767e64198..64c1c7253da 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -25,6 +25,8 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
+extern int irq_select_affinity_usr(unsigned int irq);
+
/*
* Debugging printout:
*/
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c498a1b8c62..801addda3c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -82,24 +82,27 @@ int irq_can_set_affinity(unsigned int irq)
int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
{
struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
if (!desc->chip->set_affinity)
return -EINVAL;
+ spin_lock_irqsave(&desc->lock, flags);
+
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
desc->affinity = cpumask;
desc->chip->set_affinity(irq, cpumask);
- spin_unlock_irqrestore(&desc->lock, flags);
- } else
- set_pending_irq(irq, cpumask);
+ } else {
+ desc->status |= IRQ_MOVE_PENDING;
+ desc->pending_mask = cpumask;
+ }
#else
desc->affinity = cpumask;
desc->chip->set_affinity(irq, cpumask);
#endif
+ desc->status |= IRQ_AFFINITY_SET;
+ spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
@@ -107,24 +110,59 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
/*
* Generic version of the affinity autoselector.
*/
-int irq_select_affinity(unsigned int irq)
+int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
{
cpumask_t mask;
- struct irq_desc *desc;
if (!irq_can_set_affinity(irq))
return 0;
cpus_and(mask, cpu_online_map, irq_default_affinity);
- desc = irq_to_desc(irq);
+ /*
+ * Preserve an userspace affinity setup, but make sure that
+ * one of the targets is online.
+ */
+ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
+ if (cpus_intersects(desc->affinity, cpu_online_map))
+ mask = desc->affinity;
+ else
+ desc->status &= ~IRQ_AFFINITY_SET;
+ }
+
desc->affinity = mask;
desc->chip->set_affinity(irq, mask);
return 0;
}
+#else
+static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
+{
+ return irq_select_affinity(irq);
+}
#endif
+/*
+ * Called when affinity is set via /proc/irq
+ */
+int irq_select_affinity_usr(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&desc->lock, flags);
+ ret = do_irq_select_affinity(irq, desc);
+ spin_unlock_irqrestore(&desc->lock, flags);
+
+ return ret;
+}
+
+#else
+static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
+{
+ return 0;
+}
#endif
/**
@@ -327,7 +365,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_warning("No set_type function for IRQ %d (%s)\n", irq,
+ pr_debug("No set_type function for IRQ %d (%s)\n", irq,
chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -445,8 +483,12 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
/* Undo nested disables: */
desc->depth = 1;
+ /* Exclude IRQ from balancing if requested */
+ if (new->flags & IRQF_NOBALANCING)
+ desc->status |= IRQ_NO_BALANCING;
+
/* Set default affinity mask once everything is setup */
- irq_select_affinity(irq);
+ do_irq_select_affinity(irq, desc);
} else if ((new->flags & IRQF_TRIGGER_MASK)
&& (new->flags & IRQF_TRIGGER_MASK)
@@ -459,10 +501,6 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
*p = new;
- /* Exclude IRQ from balancing */
- if (new->flags & IRQF_NOBALANCING)
- desc->status |= IRQ_NO_BALANCING;
-
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 90b920d3f52..9db681d9581 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,17 +1,6 @@
#include <linux/irq.h>
-void set_pending_irq(unsigned int irq, cpumask_t mask)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- spin_lock_irqsave(&desc->lock, flags);
- desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = mask;
- spin_unlock_irqrestore(&desc->lock, flags);
-}
-
void move_masked_irq(int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 4d161c70ba5..d257e7d6a8a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
if (!cpus_intersects(new_value, cpu_online_map))
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
- return irq_select_affinity(irq) ? -EINVAL : count;
+ return irq_select_affinity_usr(irq) ? -EINVAL : count;
irq_set_affinity(irq, new_value);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 5072cf1685a..7b8b0f21a5b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -304,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address)
char *modname;
const char *name;
unsigned long offset, size;
- char namebuf[KSYM_NAME_LEN];
+ int len;
- name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+ name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
return sprintf(buffer, "0x%lx", address);
+ if (name != buffer)
+ strcpy(buffer, name);
+ len = strlen(buffer);
+ buffer += len;
+
if (modname)
- return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
- size, modname);
+ len += sprintf(buffer, "+%#lx/%#lx [%s]",
+ offset, size, modname);
else
- return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
+ len += sprintf(buffer, "+%#lx/%#lx", offset, size);
+
+ return len;
}
/* Look up a kernel symbol and print it to the kernel messages. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 8b57a2597f2..9f8a3f25259 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -72,7 +72,7 @@ static bool kprobe_enabled;
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
static struct {
- spinlock_t lock ____cacheline_aligned;
+ spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
@@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p,
return -EINVAL;
p->addr = addr;
- if (!kernel_text_address((unsigned long) p->addr) ||
- in_kprobes_functions((unsigned long) p->addr))
+ preempt_disable();
+ if (!__kernel_text_address((unsigned long) p->addr) ||
+ in_kprobes_functions((unsigned long) p->addr)) {
+ preempt_enable();
return -EINVAL;
+ }
p->mod_refcounted = 0;
/*
* Check if are we probing a module.
*/
- probed_mod = module_text_address((unsigned long) p->addr);
+ probed_mod = __module_text_address((unsigned long) p->addr);
if (probed_mod) {
- struct module *calling_mod = module_text_address(called_from);
+ struct module *calling_mod;
+ calling_mod = __module_text_address(called_from);
/*
* We must allow modules to probe themself and in this case
* avoid incrementing the module refcount, so as to allow
* unloading of self probing modules.
*/
if (calling_mod && calling_mod != probed_mod) {
- if (unlikely(!try_module_get(probed_mod)))
+ if (unlikely(!try_module_get(probed_mod))) {
+ preempt_enable();
return -EINVAL;
+ }
p->mod_refcounted = 1;
} else
probed_mod = NULL;
}
+ preempt_enable();
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
@@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
struct kprobe *old_p;
if (p->mod_refcounted) {
+ /*
+ * Since we've already incremented refcount,
+ * we don't need to disable preemption.
+ */
mod = module_text_address((unsigned long)p->addr);
if (mod)
module_put(mod);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 5e7b45c5692..449db466bdb 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -191,7 +191,7 @@ static int lstats_show(struct seq_file *m, void *v)
latency_record[i].time,
latency_record[i].max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
- char sym[KSYM_NAME_LEN];
+ char sym[KSYM_SYMBOL_LEN];
char *c;
if (!latency_record[i].backtrace[q])
break;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06e157119d2..46a404173db 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3276,10 +3276,10 @@ void __init lockdep_info(void)
{
printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
- printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
- printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
diff --git a/kernel/panic.c b/kernel/panic.c
index 6513aac8e99..4d5088355bf 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -167,6 +167,7 @@ static const struct tnt tnts[] = {
* 'M' - System experienced a machine check exception.
* 'B' - System has hit bad_page.
* 'U' - Userspace-defined naughtiness.
+ * 'D' - Kernel has oopsed before
* 'A' - ACPI table overridden.
* 'W' - Taint on warning.
* 'C' - modules from drivers/staging are loaded.
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 153dcb2639c..4e5288a831d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -311,7 +311,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
struct task_cputime cputime;
thread_group_cputime(p, &cputime);
- switch (which_clock) {
+ switch (CPUCLOCK_WHICH(which_clock)) {
default:
return -EINVAL;
case CPUCLOCK_PROF:
@@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
*/
static inline int fastpath_timer_check(struct task_struct *tsk)
{
- struct signal_struct *sig = tsk->signal;
+ struct signal_struct *sig;
- if (unlikely(!sig))
+ /* tsk == current, ensure it is safe to use ->signal/sighand */
+ if (unlikely(tsk->exit_state))
return 0;
if (!task_cputime_zero(&tsk->cputime_expires)) {
@@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
return 1;
}
+
+ sig = tsk->signal;
if (!task_cputime_zero(&sig->cputime_expires)) {
struct task_cputime group_sample;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 19122cf6d82..b8f7ce9473e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -174,7 +174,7 @@ static void suspend_test_finish(const char *label)
* has some performance issues. The stack dump of a WARN_ON
* is more likely to get the right attention than a printk...
*/
- WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000));
+ WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
}
#else
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b7713b53d07..6da14358537 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -633,7 +633,7 @@ void swsusp_close(fmode_t mode)
return;
}
- blkdev_put(resume_bdev, mode); /* move up */
+ blkdev_put(resume_bdev, mode);
}
static int swsusp_header_init(void)
diff --git a/kernel/profile.c b/kernel/profile.c
index 9830a037d8d..dc41827fbfe 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -351,7 +351,7 @@ out:
put_cpu();
}
-static int __devinit profile_cpu_callback(struct notifier_block *info,
+static int __cpuinit profile_cpu_callback(struct notifier_block *info,
unsigned long action, void *__cpu)
{
int node, cpu = (unsigned long)__cpu;
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
};
#ifdef CONFIG_SMP
-static void __init profile_nop(void *unused)
+static inline void profile_nop(void *unused)
{
}
@@ -596,7 +596,7 @@ out_cleanup:
#define create_hash_tables() ({ 0; })
#endif
-int create_proc_profile(void)
+int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
{
struct proc_dir_entry *entry;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1e68e4c39e2..4c8bcd7dd8e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
return (copied == sizeof(data)) ? 0 : -EIO;
}
-#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE
+#if defined CONFIG_COMPAT
#include <linux/compat.h>
int compat_ptrace_request(struct task_struct *child, compat_long_t request,
@@ -709,4 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
unlock_kernel();
return ret;
}
-#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/relay.c b/kernel/relay.c
index 8d13a7855c0..09ac2008f77 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan)
}
mutex_lock(&relay_channels_mutex);
- for_each_online_cpu(i)
+ for_each_possible_cpu(i)
if (chan->buf[i])
__relay_reset(chan->buf[i], 0);
mutex_unlock(&relay_channels_mutex);
@@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename,
return chan;
free_bufs:
- for_each_online_cpu(i) {
- if (!chan->buf[i])
- break;
- relay_close_buf(chan->buf[i]);
+ for_each_possible_cpu(i) {
+ if (chan->buf[i])
+ relay_close_buf(chan->buf[i]);
}
kref_put(&chan->kref, relay_destroy_channel);
@@ -1318,12 +1317,9 @@ static ssize_t relay_file_splice_read(struct file *in,
if (ret < 0)
break;
else if (!ret) {
- if (spliced)
- break;
- if (flags & SPLICE_F_NONBLOCK) {
+ if (flags & SPLICE_F_NONBLOCK)
ret = -EAGAIN;
- break;
- }
+ break;
}
*ppos += ret;
diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc6f46..e4bb1dd7b30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -397,9 +397,9 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next;
+ struct sched_entity *curr, *next, *last;
- unsigned long nr_spread_over;
+ unsigned int nr_spread_over;
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -969,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
}
}
+void task_rq_unlock_wait(struct task_struct *p)
+{
+ struct rq *rq = task_rq(p);
+
+ smp_mb(); /* spin-unlock-wait is not a full memory barrier */
+ spin_unlock_wait(&rq->lock);
+}
+
static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
@@ -1445,9 +1453,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
- if (rq->nr_running)
- rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+ if (nr_running)
+ rq->avg_load_per_task = rq->load.weight / nr_running;
+ else
+ rq->avg_load_per_task = 0;
return rq->avg_load_per_task;
}
@@ -1805,7 +1816,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+ if (sched_feat(CACHE_HOT_BUDDY) &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
return 1;
if (p->sched_class != &fair_sched_class)
@@ -5858,6 +5871,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
+ spin_lock_irqsave(&rq->lock, flags);
+
__sched_fork(idle);
idle->se.exec_start = sched_clock();
@@ -5865,7 +5880,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->cpus_allowed = cpumask_of_cpu(cpu);
__set_task_cpu(idle, cpu);
- spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
idle->oncpu = 1;
@@ -6573,7 +6587,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
req = list_entry(rq->migration_queue.next,
struct migration_req, list);
list_del_init(&req->list);
+ spin_unlock_irq(&rq->lock);
complete(&req->done);
+ spin_lock_irq(&rq->lock);
}
spin_unlock_irq(&rq->lock);
break;
@@ -6875,15 +6891,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
- for (tmp = sd; tmp; tmp = tmp->parent) {
+ for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
+
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
- }
+ } else
+ tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
@@ -7672,6 +7690,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
error:
free_sched_groups(cpu_map, tmpmask);
SCHED_CPUMASK_FREE((void *)allmasks);
+ kfree(rd);
return -ENOMEM;
#endif
}
@@ -7773,13 +7792,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* The passed in 'doms_new' should be kmalloc'd. This routine takes
* ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
*
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
*
* Call with hotplug lock held
*/
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 81787248b60..e8ab096ddfe 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
/*
* scd->clock = clamp(scd->tick_gtod + delta,
- * max(scd->tick_gtod, scd->clock),
- * max(scd->clock, scd->tick_gtod + TICK_NSEC));
+ * max(scd->tick_gtod, scd->clock),
+ * scd->tick_gtod + TICK_NSEC);
*/
clock = scd->tick_gtod + delta;
min_clock = wrap_max(scd->tick_gtod, scd->clock);
- max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+ max_clock = scd->tick_gtod + TICK_NSEC;
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ae17762ec3..26ed8e3d1c1 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -144,7 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
last = __pick_last_entity(cfs_rq);
if (last)
max_vruntime = last->vruntime;
- min_vruntime = rq->cfs.min_vruntime;
+ min_vruntime = cfs_rq->min_vruntime;
rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime;
spin_unlock_irqrestore(&rq->lock, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
@@ -161,26 +161,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(spread0));
SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
-
- P(yld_exp_empty);
- P(yld_act_empty);
- P(yld_both_empty);
- P(yld_count);
- P(sched_switch);
- P(sched_count);
- P(sched_goidle);
-
- P(ttwu_count);
- P(ttwu_local);
-
- P(bkl_count);
-
-#undef P
-#endif
- SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
+ SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
@@ -260,6 +242,25 @@ static void print_cpu(struct seq_file *m, int cpu)
#undef P
#undef PN
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
+
+ P(yld_exp_empty);
+ P(yld_act_empty);
+ P(yld_both_empty);
+ P(yld_count);
+
+ P(sched_switch);
+ P(sched_count);
+ P(sched_goidle);
+
+ P(ttwu_count);
+ P(ttwu_local);
+
+ P(bkl_count);
+
+#undef P
+#endif
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
@@ -422,10 +423,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#undef __P
{
+ unsigned int this_cpu = raw_smp_processor_id();
u64 t0, t1;
- t0 = sched_clock();
- t1 = sched_clock();
+ t0 = cpu_clock(this_cpu);
+ t1 = cpu_clock(this_cpu);
SEQ_printf(m, "%-35s:%21Ld\n",
"clock-delta", (long long)(t1-t0));
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ce514afd78f..98345e45b05 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -341,23 +341,20 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->rb_leftmost = next_node;
}
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
-
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->rb_leftmost;
-}
-
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
- return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
+ struct rb_node *left = cfs_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_entity, run_node);
}
-static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -719,6 +716,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity(cfs_rq, se);
}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -741,6 +747,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
#endif
}
+ clear_buddies(cfs_rq, se);
+
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
@@ -794,24 +802,15 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
-static struct sched_entity *
-pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (!cfs_rq->next || wakeup_preempt_entity(cfs_rq->next, se) == 1)
- return se;
-
- return cfs_rq->next;
-}
-
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = NULL;
+ struct sched_entity *se = __pick_next_entity(cfs_rq);
- if (first_fair(cfs_rq)) {
- se = __pick_next_entity(cfs_rq);
- se = pick_next(cfs_rq, se);
- set_next_entity(cfs_rq, se);
- }
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
+ return cfs_rq->next;
+
+ if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
+ return cfs_rq->last;
return se;
}
@@ -983,6 +982,8 @@ static void yield_task_fair(struct rq *rq)
if (unlikely(cfs_rq->nr_running == 1))
return;
+ clear_buddies(cfs_rq, se);
+
if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
/*
@@ -1325,26 +1326,53 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
return 0;
}
+static void set_last_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
+}
+
+static void set_next_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+}
+
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
{
struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
if (unlikely(rt_prio(p->prio))) {
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+
update_rq_clock(rq);
update_curr(cfs_rq);
resched_task(curr);
return;
}
+ if (unlikely(p->sched_class != &fair_sched_class))
+ return;
+
if (unlikely(se == pse))
return;
- cfs_rq_of(pse)->next = pse;
+ /*
+ * Only set the backward buddy when the current task is still on the
+ * rq. This can happen when a wakeup gets interleaved with schedule on
+ * the ->pre_schedule() or idle_balance() point, either of which can
+ * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class, for
+ * obvious reasons its a bad idea to schedule back to the idle thread.
+ */
+ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
+ set_last_buddy(se);
+ set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1396,6 +1424,7 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
+ set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index fda01621829..da5d93b5d2c 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,3 +12,4 @@ SCHED_FEAT(LB_BIAS, 1)
SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
+SCHED_FEAT(LAST_BUDDY, 1)
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index ee71bec1da6..7dbf72a2b02 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -298,9 +298,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
{
struct signal_struct *sig;
- sig = tsk->signal;
- if (unlikely(!sig))
+ /* tsk == current, ensure it is safe to use ->signal */
+ if (unlikely(tsk->exit_state))
return;
+
+ sig = tsk->signal;
if (sig->cputime.totals) {
struct task_cputime *times;
@@ -325,9 +327,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
{
struct signal_struct *sig;
- sig = tsk->signal;
- if (unlikely(!sig))
+ /* tsk == current, ensure it is safe to use ->signal */
+ if (unlikely(tsk->exit_state))
return;
+
+ sig = tsk->signal;
if (sig->cputime.totals) {
struct task_cputime *times;
@@ -353,8 +357,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
struct signal_struct *sig;
sig = tsk->signal;
+ /* see __exit_signal()->task_rq_unlock_wait() */
+ barrier();
if (unlikely(!sig))
return;
+
if (sig->cputime.totals) {
struct task_cputime *times;
diff --git a/kernel/smp.c b/kernel/smp.c
index f362a855377..75c8dde58c5 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -51,10 +51,6 @@ static void csd_flag_wait(struct call_single_data *data)
{
/* Wait for response */
do {
- /*
- * We need to see the flags store in the IPI handler
- */
- smp_mb();
if (!(data->flags & CSD_FLAG_WAIT))
break;
cpu_relax();
@@ -76,6 +72,11 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
list_add_tail(&data->list, &dst->list);
spin_unlock_irqrestore(&dst->lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
if (ipi)
arch_send_call_function_single_ipi(cpu);
@@ -157,7 +158,7 @@ void generic_smp_call_function_single_interrupt(void)
* Need to see other stores to list head for checking whether
* list is empty without holding q->lock
*/
- smp_mb();
+ smp_read_barrier_depends();
while (!list_empty(&q->list)) {
unsigned int data_flags;
@@ -191,7 +192,7 @@ void generic_smp_call_function_single_interrupt(void)
/*
* See comment on outer loop
*/
- smp_mb();
+ smp_read_barrier_depends();
}
}
@@ -370,6 +371,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
list_add_tail_rcu(&data->csd.list, &call_function_queue);
spin_unlock_irqrestore(&call_function_lock, flags);
+ /*
+ * Make the list addition visible before sending the ipi.
+ */
+ smp_mb();
+
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi(mask);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7110daeb9a9..e7c69a720d6 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -269,10 +269,11 @@ void irq_enter(void)
{
int cpu = smp_processor_id();
- if (idle_cpu(cpu) && !in_interrupt())
+ if (idle_cpu(cpu) && !in_interrupt()) {
+ __irq_enter();
tick_check_idle(cpu);
-
- __irq_enter();
+ } else
+ __irq_enter();
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 3953e4aed73..dc0b3be6b7d 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -188,7 +188,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
if ((long)(now - t->last_switch_timestamp) <
sysctl_hung_task_timeout_secs)
return;
- if (sysctl_hung_task_warnings < 0)
+ if (!sysctl_hung_task_warnings)
return;
sysctl_hung_task_warnings--;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bc4c00872c..24e8ceacc38 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -112,7 +112,7 @@ static int chill(void *unused)
int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
{
struct work_struct *sm_work;
- int i;
+ int i, ret;
/* Set up initial state. */
mutex_lock(&lock);
@@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
/* This will release the thread on our CPU. */
put_cpu();
flush_workqueue(stop_machine_wq);
+ ret = active.fnret;
mutex_unlock(&lock);
- return active.fnret;
+ return ret;
}
int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b0..e14a2328170 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
cond_syscall(sys_bind);
cond_syscall(sys_listen);
cond_syscall(sys_accept);
-cond_syscall(sys_paccept);
+cond_syscall(sys_accept4);
cond_syscall(sys_connect);
cond_syscall(sys_getsockname);
cond_syscall(sys_getpeername);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9d048fa2d90..3d56fe7570d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -176,6 +176,9 @@ extern struct ctl_table random_table[];
#ifdef CONFIG_INOTIFY_USER
extern struct ctl_table inotify_table[];
#endif
+#ifdef CONFIG_EPOLL
+extern struct ctl_table epoll_table[];
+#endif
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout;
@@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = {
.child = inotify_table,
},
#endif
+#ifdef CONFIG_EPOLL
+ {
+ .procname = "epoll",
+ .mode = 0555,
+ .child = epoll_table,
+ },
+#endif
#endif
{
.ctl_name = KERN_SETUID_DUMPABLE,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 5bbb1044f84..342fc9ccab4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -568,6 +568,9 @@ static void tick_nohz_switch_to_nohz(void)
*/
static void tick_nohz_kick_tick(int cpu)
{
+#if 0
+ /* Switch back to 2.6.27 behaviour */
+
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
ktime_t delta, now;
@@ -584,6 +587,7 @@ static void tick_nohz_kick_tick(int cpu)
return;
tick_nohz_restart(ts, now);
+#endif
}
#else
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e7acfb482a6..fa05e88aa76 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -518,6 +518,28 @@ void update_wall_time(void)
/* correct the clock when NTP error is too big */
clocksource_adjust(offset);
+ /*
+ * Since in the loop above, we accumulate any amount of time
+ * in xtime_nsec over a second into xtime.tv_sec, its possible for
+ * xtime_nsec to be fairly small after the loop. Further, if we're
+ * slightly speeding the clocksource up in clocksource_adjust(),
+ * its possible the required corrective factor to xtime_nsec could
+ * cause it to underflow.
+ *
+ * Now, we cannot simply roll the accumulated second back, since
+ * the NTP subsystem has been notified via second_overflow. So
+ * instead we push xtime_nsec forward by the amount we underflowed,
+ * and add that amount into the error.
+ *
+ * We'll correct this error next time through this function, when
+ * xtime_nsec is not as small.
+ */
+ if (unlikely((s64)clock->xtime_nsec < 0)) {
+ s64 neg = -(s64)clock->xtime_nsec;
+ clock->xtime_nsec = 0;
+ clock->error += neg << (NTP_SCALE_SHIFT - clock->shift);
+ }
+
/* store full nanoseconds into xtime after rounding it up and
* add the remainder to the error difference.
*/
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf373c5..dbd50fabe4c 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -112,27 +112,8 @@ timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
tbase_get_deferrable(timer->base));
}
-/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
+static unsigned long round_jiffies_common(unsigned long j, int cpu,
+ bool force_up)
{
int rem;
unsigned long original = j;
@@ -154,8 +135,9 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
* due to delays of the timer irq, long irq off times etc etc) then
* we should round down to the whole second, not up. Use 1/4th second
* as cutoff for this rounding as an extreme upper bound for this.
+ * But never round down if @force_up is set.
*/
- if (rem < HZ/4) /* round down */
+ if (rem < HZ/4 && !force_up) /* round down */
j = j - rem;
else /* round up */
j = j - rem + HZ;
@@ -167,6 +149,31 @@ unsigned long __round_jiffies(unsigned long j, int cpu)
return original;
return j;
}
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, false);
+}
EXPORT_SYMBOL_GPL(__round_jiffies);
/**
@@ -191,13 +198,10 @@ EXPORT_SYMBOL_GPL(__round_jiffies);
*/
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
- /*
- * In theory the following code can skip a jiffy in case jiffies
- * increments right between the addition and the later subtraction.
- * However since the entire point of this function is to use approximate
- * timeouts, it's entirely ok to not handle that.
- */
- return __round_jiffies(j + jiffies, cpu) - jiffies;
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, false) - j0;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);
@@ -218,7 +222,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative);
*/
unsigned long round_jiffies(unsigned long j)
{
- return __round_jiffies(j, raw_smp_processor_id());
+ return round_jiffies_common(j, raw_smp_processor_id(), false);
}
EXPORT_SYMBOL_GPL(round_jiffies);
@@ -243,6 +247,71 @@ unsigned long round_jiffies_relative(unsigned long j)
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);
+/**
+ * __round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up(unsigned long j, int cpu)
+{
+ return round_jiffies_common(j, cpu, true);
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up);
+
+/**
+ * __round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * This is the same as __round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
+{
+ unsigned long j0 = jiffies;
+
+ /* Use j0 because jiffies might change while we run */
+ return round_jiffies_common(j + j0, cpu, true) - j0;
+}
+EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
+
+/**
+ * round_jiffies_up - function to round jiffies up to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up(unsigned long j)
+{
+ return round_jiffies_common(j, raw_smp_processor_id(), true);
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up);
+
+/**
+ * round_jiffies_up_relative - function to round jiffies up to a full second
+ * @j: the time in (relative) jiffies that should be rounded
+ *
+ * This is the same as round_jiffies_relative() except that it will never
+ * round down. This is useful for timeouts for which the exact time
+ * of firing does not matter too much, as long as they don't fire too
+ * early.
+ */
+unsigned long round_jiffies_up_relative(unsigned long j)
+{
+ return __round_jiffies_up_relative(j, raw_smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
+
static inline void set_running_timer(struct tvec_base *base,
struct timer_list *timer)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b58f43bec36..33dbefd471e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -25,7 +25,7 @@ config TRACING
bool
select DEBUG_FS
select RING_BUFFER
- select STACKTRACE
+ select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
select NOP_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a39d24568c..78db083390f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -185,7 +185,6 @@ enum {
};
static int ftrace_filtered;
-static int tracing_on;
static LIST_HEAD(ftrace_new_addrs);
@@ -327,96 +326,89 @@ ftrace_record_ip(unsigned long ip)
static int
__ftrace_replace_code(struct dyn_ftrace *rec,
- unsigned char *old, unsigned char *new, int enable)
+ unsigned char *nop, int enable)
{
unsigned long ip, fl;
+ unsigned char *call, *old, *new;
ip = rec->ip;
- if (ftrace_filtered && enable) {
+ /*
+ * If this record is not to be traced and
+ * it is not enabled then do nothing.
+ *
+ * If this record is not to be traced and
+ * it is enabled then disabled it.
+ *
+ */
+ if (rec->flags & FTRACE_FL_NOTRACE) {
+ if (rec->flags & FTRACE_FL_ENABLED)
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ else
+ return 0;
+
+ } else if (ftrace_filtered && enable) {
/*
- * If filtering is on:
- *
- * If this record is set to be filtered and
- * is enabled then do nothing.
- *
- * If this record is set to be filtered and
- * it is not enabled, enable it.
- *
- * If this record is not set to be filtered
- * and it is not enabled do nothing.
- *
- * If this record is set not to trace then
- * do nothing.
- *
- * If this record is set not to trace and
- * it is enabled then disable it.
- *
- * If this record is not set to be filtered and
- * it is enabled, disable it.
+ * Filtering is on:
*/
- fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
- FTRACE_FL_ENABLED);
+ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
- if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
- (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
- !fl || (fl == FTRACE_FL_NOTRACE))
+ /* Record is filtered and enabled, do nothing */
+ if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
return 0;
- /*
- * If it is enabled disable it,
- * otherwise enable it!
- */
- if (fl & FTRACE_FL_ENABLED) {
- /* swap new and old */
- new = old;
- old = ftrace_call_replace(ip, FTRACE_ADDR);
+ /* Record is not filtered and is not enabled do nothing */
+ if (!fl)
+ return 0;
+
+ /* Record is not filtered but enabled, disable it */
+ if (fl == FTRACE_FL_ENABLED)
rec->flags &= ~FTRACE_FL_ENABLED;
- } else {
- new = ftrace_call_replace(ip, FTRACE_ADDR);
+ else
+ /* Otherwise record is filtered but not enabled, enable it */
rec->flags |= FTRACE_FL_ENABLED;
- }
} else {
+ /* Disable or not filtered */
if (enable) {
- /*
- * If this record is set not to trace and is
- * not enabled, do nothing.
- */
- fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
- if (fl == FTRACE_FL_NOTRACE)
- return 0;
-
- new = ftrace_call_replace(ip, FTRACE_ADDR);
- } else
- old = ftrace_call_replace(ip, FTRACE_ADDR);
-
- if (enable) {
+ /* if record is enabled, do nothing */
if (rec->flags & FTRACE_FL_ENABLED)
return 0;
+
rec->flags |= FTRACE_FL_ENABLED;
+
} else {
+
+ /* if record is not enabled do nothing */
if (!(rec->flags & FTRACE_FL_ENABLED))
return 0;
+
rec->flags &= ~FTRACE_FL_ENABLED;
}
}
+ call = ftrace_call_replace(ip, FTRACE_ADDR);
+
+ if (rec->flags & FTRACE_FL_ENABLED) {
+ old = nop;
+ new = call;
+ } else {
+ old = call;
+ new = nop;
+ }
+
return ftrace_modify_code(ip, old, new);
}
static void ftrace_replace_code(int enable)
{
int i, failed;
- unsigned char *new = NULL, *old = NULL;
+ unsigned char *nop = NULL;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
- if (enable)
- old = ftrace_nop_replace();
- else
- new = ftrace_nop_replace();
+ nop = ftrace_nop_replace();
for (pg = ftrace_pages_start; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
@@ -434,7 +426,7 @@ static void ftrace_replace_code(int enable)
unfreeze_record(rec);
}
- failed = __ftrace_replace_code(rec, old, new, enable);
+ failed = __ftrace_replace_code(rec, nop, enable);
if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
rec->flags |= FTRACE_FL_FAILED;
if ((system_state == SYSTEM_BOOTING) ||
@@ -506,13 +498,10 @@ static int __ftrace_modify_code(void *data)
{
int *command = data;
- if (*command & FTRACE_ENABLE_CALLS) {
+ if (*command & FTRACE_ENABLE_CALLS)
ftrace_replace_code(1);
- tracing_on = 1;
- } else if (*command & FTRACE_DISABLE_CALLS) {
+ else if (*command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
- tracing_on = 0;
- }
if (*command & FTRACE_UPDATE_TRACE_FUNC)
ftrace_update_ftrace_func(ftrace_trace_function);
@@ -538,8 +527,7 @@ static void ftrace_startup(void)
mutex_lock(&ftrace_start_lock);
ftrace_start++;
- if (ftrace_start == 1)
- command |= FTRACE_ENABLE_CALLS;
+ command |= FTRACE_ENABLE_CALLS;
if (saved_ftrace_func != ftrace_trace_function) {
saved_ftrace_func = ftrace_trace_function;
@@ -677,7 +665,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
cnt = num_to_init / ENTRIES_PER_PAGE;
pr_info("ftrace: allocating %ld entries in %d pages\n",
- num_to_init, cnt);
+ num_to_init, cnt + 1);
for (i = 0; i < cnt; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -738,6 +726,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
((iter->flags & FTRACE_ITER_FAILURES) &&
!(rec->flags & FTRACE_FL_FAILED)) ||
+ ((iter->flags & FTRACE_ITER_FILTER) &&
+ !(rec->flags & FTRACE_FL_FILTER)) ||
+
((iter->flags & FTRACE_ITER_NOTRACE) &&
!(rec->flags & FTRACE_FL_NOTRACE))) {
rec = NULL;
@@ -757,13 +748,11 @@ static void *t_start(struct seq_file *m, loff_t *pos)
void *p = NULL;
loff_t l = -1;
- if (*pos != iter->pos) {
- for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
- ;
- } else {
- l = *pos;
- p = t_next(m, p, &l);
- }
+ if (*pos > iter->pos)
+ *pos = iter->pos;
+
+ l = *pos;
+ p = t_next(m, p, &l);
return p;
}
@@ -774,15 +763,21 @@ static void t_stop(struct seq_file *m, void *p)
static int t_show(struct seq_file *m, void *v)
{
+ struct ftrace_iterator *iter = m->private;
struct dyn_ftrace *rec = v;
char str[KSYM_SYMBOL_LEN];
+ int ret = 0;
if (!rec)
return 0;
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
- seq_printf(m, "%s\n", str);
+ ret = seq_printf(m, "%s\n", str);
+ if (ret < 0) {
+ iter->pos--;
+ iter->idx--;
+ }
return 0;
}
@@ -808,7 +803,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return -ENOMEM;
iter->pg = ftrace_pages_start;
- iter->pos = -1;
+ iter->pos = 0;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
@@ -895,7 +890,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
- iter->pos = -1;
+ iter->pos = 0;
iter->flags = enable ? FTRACE_ITER_FILTER :
FTRACE_ITER_NOTRACE;
@@ -1186,7 +1181,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
mutex_lock(&ftrace_sysctl_lock);
mutex_lock(&ftrace_start_lock);
- if (iter->filtered && ftrace_start && ftrace_enabled)
+ if (ftrace_start && ftrace_enabled)
ftrace_run_update_code(FTRACE_ENABLE_CALLS);
mutex_unlock(&ftrace_start_lock);
mutex_unlock(&ftrace_sysctl_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index cedf4e26828..668bbb5ef2b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,14 +16,49 @@
#include <linux/list.h>
#include <linux/fs.h>
+#include "trace.h"
+
+/* Global flag to disable all recording to ring buffers */
+static int ring_buffers_off __read_mostly;
+
+/**
+ * tracing_on - enable all tracing buffers
+ *
+ * This function enables all tracing buffers that may have been
+ * disabled with tracing_off.
+ */
+void tracing_on(void)
+{
+ ring_buffers_off = 0;
+}
+
+/**
+ * tracing_off - turn off all tracing buffers
+ *
+ * This function stops all tracing buffers from recording data.
+ * It does not disable any overhead the tracers themselves may
+ * be causing. This function simply causes all recording to
+ * the ring buffers to fail.
+ */
+void tracing_off(void)
+{
+ ring_buffers_off = 1;
+}
+
/* Up this if you want to test the TIME_EXTENTS and normalization */
#define DEBUG_SHIFT 0
/* FIXME!!! */
u64 ring_buffer_time_stamp(int cpu)
{
+ u64 time;
+
+ preempt_disable_notrace();
/* shift to debug/test normalization and TIME_EXTENTS */
- return sched_clock() << DEBUG_SHIFT;
+ time = sched_clock() << DEBUG_SHIFT;
+ preempt_enable_notrace();
+
+ return time;
}
void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
@@ -503,6 +538,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
LIST_HEAD(pages);
int i, cpu;
+ /*
+ * Always succeed at resizing a non-existent buffer:
+ */
+ if (!buffer)
+ return size;
+
size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
size *= BUF_PAGE_SIZE;
buffer_size = buffer->pages * BUF_PAGE_SIZE;
@@ -576,6 +617,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
list_del_init(&page->list);
free_buffer_page(page);
}
+ mutex_unlock(&buffer->mutex);
return -ENOMEM;
}
@@ -1022,8 +1064,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event;
u64 ts, delta;
int commit = 0;
+ int nr_loops = 0;
again:
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (unlikely(++nr_loops > 1000)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
ts = ring_buffer_time_stamp(cpu_buffer->cpu);
/*
@@ -1045,7 +1102,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
/* Did the write stamp get updated already? */
if (unlikely(ts < cpu_buffer->write_stamp))
- goto again;
+ delta = 0;
if (test_time_stamp(delta)) {
@@ -1118,6 +1175,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
struct ring_buffer_event *event;
int cpu, resched;
+ if (ring_buffers_off)
+ return NULL;
+
if (atomic_read(&buffer->record_disabled))
return NULL;
@@ -1155,7 +1215,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
out:
if (resched)
- preempt_enable_notrace();
+ preempt_enable_no_resched_notrace();
else
preempt_enable_notrace();
return NULL;
@@ -1234,6 +1294,9 @@ int ring_buffer_write(struct ring_buffer *buffer,
int ret = -EBUSY;
int cpu, resched;
+ if (ring_buffers_off)
+ return -EBUSY;
+
if (atomic_read(&buffer->record_disabled))
return -EBUSY;
@@ -1532,10 +1595,23 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long flags;
+ int nr_loops = 0;
spin_lock_irqsave(&cpu_buffer->lock, flags);
again:
+ /*
+ * This should normally only loop twice. But because the
+ * start of the reader inserts an empty page, it causes
+ * a case where we will loop three times. There should be no
+ * reason to loop four times (that I know of).
+ */
+ if (unlikely(++nr_loops > 3)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ reader = NULL;
+ goto out;
+ }
+
reader = cpu_buffer->reader_page;
/* If there's more to read, return this page */
@@ -1665,6 +1741,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
struct buffer_page *reader;
+ int nr_loops = 0;
if (!cpu_isset(cpu, buffer->cpumask))
return NULL;
@@ -1672,6 +1749,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
cpu_buffer = buffer->buffers[cpu];
again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (unlikely(++nr_loops > 10)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
return NULL;
@@ -1722,6 +1812,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer *buffer;
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_event *event;
+ int nr_loops = 0;
if (ring_buffer_iter_empty(iter))
return NULL;
@@ -1730,6 +1821,19 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
buffer = cpu_buffer->buffer;
again:
+ /*
+ * We repeat when a timestamp is encountered. It is possible
+ * to get multiple timestamps from an interrupt entering just
+ * as one timestamp is about to be written. The max times
+ * that this can happen is the number of nested interrupts we
+ * can have. Nesting 10 deep of interrupts is clearly
+ * an anomaly.
+ */
+ if (unlikely(++nr_loops > 10)) {
+ RB_WARN_ON(cpu_buffer, 1);
+ return NULL;
+ }
+
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
@@ -2014,3 +2118,69 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
return 0;
}
+static ssize_t
+rb_simple_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int *p = filp->private_data;
+ char buf[64];
+ int r;
+
+ /* !ring_buffers_off == tracing_on */
+ r = sprintf(buf, "%d\n", !*p);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+rb_simple_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ int *p = filp->private_data;
+ char buf[64];
+ long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ /* !ring_buffers_off == tracing_on */
+ *p = !val;
+
+ (*ppos)++;
+
+ return cnt;
+}
+
+static struct file_operations rb_simple_fops = {
+ .open = tracing_open_generic,
+ .read = rb_simple_read,
+ .write = rb_simple_write,
+};
+
+
+static __init int rb_init_debugfs(void)
+{
+ struct dentry *d_tracer;
+ struct dentry *entry;
+
+ d_tracer = tracing_init_dentry();
+
+ entry = debugfs_create_file("tracing_on", 0644, d_tracer,
+ &ring_buffers_off, &rb_simple_fops);
+ if (!entry)
+ pr_warning("Could not create debugfs 'tracing_on' entry\n");
+
+ return 0;
+}
+
+fs_initcall(rb_init_debugfs);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a499e2adae..d86e3252f30 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -705,6 +705,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
unsigned long flags,
int skip, int pc)
{
+#ifdef CONFIG_STACKTRACE
struct ring_buffer_event *event;
struct stack_entry *entry;
struct stack_trace trace;
@@ -730,6 +731,7 @@ static void ftrace_trace_stack(struct trace_array *tr,
save_stack_trace(&trace);
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
}
void __trace_stack(struct trace_array *tr,
@@ -1086,17 +1088,20 @@ static void s_stop(struct seq_file *m, void *p)
mutex_unlock(&trace_types_lock);
}
-#define KRETPROBE_MSG "[unknown/kretprobe'd]"
-
#ifdef CONFIG_KRETPROBES
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
{
- return addr == (unsigned long)kretprobe_trampoline;
+ static const char tramp_name[] = "kretprobe_trampoline";
+ int size = sizeof(tramp_name);
+
+ if (strncmp(tramp_name, name, size) == 0)
+ return "[unknown/kretprobe'd]";
+ return name;
}
#else
-static inline int kretprobed(unsigned long addr)
+static inline const char *kretprobed(const char *name)
{
- return 0;
+ return name;
}
#endif /* CONFIG_KRETPROBES */
@@ -1105,10 +1110,13 @@ seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+ const char *name;
kallsyms_lookup(address, NULL, NULL, NULL, str);
- return trace_seq_printf(s, fmt, str);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
#endif
return 1;
}
@@ -1119,9 +1127,12 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
{
#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+ const char *name;
sprint_symbol(str, address);
- return trace_seq_printf(s, fmt, str);
+ name = kretprobed(str);
+
+ return trace_seq_printf(s, fmt, name);
#endif
return 1;
}
@@ -1375,10 +1386,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
seq_print_ip_sym(s, field->ip, sym_flags);
trace_seq_puts(s, " (");
- if (kretprobed(field->parent_ip))
- trace_seq_puts(s, KRETPROBE_MSG);
- else
- seq_print_ip_sym(s, field->parent_ip, sym_flags);
+ seq_print_ip_sym(s, field->parent_ip, sym_flags);
trace_seq_puts(s, ")\n");
break;
}
@@ -1494,12 +1502,9 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
ret = trace_seq_printf(s, " <-");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- if (kretprobed(field->parent_ip))
- ret = trace_seq_puts(s, KRETPROBE_MSG);
- else
- ret = seq_print_ip_sym(s,
- field->parent_ip,
- sym_flags);
+ ret = seq_print_ip_sym(s,
+ field->parent_ip,
+ sym_flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
@@ -1750,7 +1755,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
return TRACE_TYPE_HANDLED;
SEQ_PUT_FIELD_RET(s, entry->pid);
- SEQ_PUT_FIELD_RET(s, iter->cpu);
+ SEQ_PUT_FIELD_RET(s, entry->cpu);
SEQ_PUT_FIELD_RET(s, iter->ts);
switch (entry->type) {
@@ -1931,6 +1936,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
ring_buffer_read_finish(iter->buffer_iter[cpu]);
}
mutex_unlock(&trace_types_lock);
+ kfree(iter);
return ERR_PTR(-ENOMEM);
}
@@ -2671,7 +2677,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
{
unsigned long val;
char buf[64];
- int ret;
+ int ret, cpu;
struct trace_array *tr = filp->private_data;
if (cnt >= sizeof(buf))
@@ -2699,6 +2705,14 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
goto out;
}
+ /* disable all cpu buffers */
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_inc(&max_tr.data[cpu]->disabled);
+ }
+
if (val != global_trace.entries) {
ret = ring_buffer_resize(global_trace.buffer, val);
if (ret < 0) {
@@ -2730,6 +2744,13 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
if (tracing_disabled)
cnt = -ENOMEM;
out:
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_dec(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_dec(&max_tr.data[cpu]->disabled);
+ }
+
max_tr.entries = global_trace.entries;
mutex_unlock(&trace_types_lock);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index f28484618ff..e62cbf78eab 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -18,12 +18,14 @@ struct header_iter {
static struct trace_array *mmio_trace_array;
static bool overrun_detected;
+static unsigned long prev_overruns;
static void mmio_reset_data(struct trace_array *tr)
{
int cpu;
overrun_detected = false;
+ prev_overruns = 0;
tr->time_start = ftrace_now(tr->cpu);
for_each_online_cpu(cpu)
@@ -128,16 +130,12 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
- int cpu;
unsigned long cnt = 0;
-/* FIXME: */
-#if 0
- for_each_online_cpu(cpu) {
- cnt += iter->overrun[cpu];
- iter->overrun[cpu] = 0;
- }
-#endif
- (void)cpu;
+ unsigned long over = ring_buffer_overruns(iter->tr->buffer);
+
+ if (over > prev_overruns)
+ cnt = over - prev_overruns;
+ prev_overruns = over;
return cnt;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index be682b62fe5..3bdb44bde4b 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -184,11 +184,16 @@ static struct file_operations stack_max_size_fops = {
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- long i = (long)m->private;
+ long i;
(*pos)++;
- i++;
+ if (v == SEQ_START_TOKEN)
+ i = 0;
+ else {
+ i = *(long *)v;
+ i++;
+ }
if (i >= max_stack_trace.nr_entries ||
stack_dump_trace[i] == ULONG_MAX)
@@ -201,12 +206,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
- void *t = &m->private;
+ void *t = SEQ_START_TOKEN;
loff_t l = 0;
local_irq_disable();
__raw_spin_lock(&max_stack_lock);
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
+
for (; t && l < *pos; t = t_next(m, t, &l))
;
@@ -235,10 +243,10 @@ static int trace_lookup_stack(struct seq_file *m, long i)
static int t_show(struct seq_file *m, void *v)
{
- long i = *(long *)v;
+ long i;
int size;
- if (i < 0) {
+ if (v == SEQ_START_TOKEN) {
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
@@ -246,6 +254,8 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
+ i = *(long *)v;
+
if (i >= max_stack_trace.nr_entries ||
stack_dump_trace[i] == ULONG_MAX)
return 0;
@@ -275,10 +285,6 @@ static int stack_trace_open(struct inode *inode, struct file *file)
int ret;
ret = seq_open(file, &stack_trace_seq_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = (void *)-1;
- }
return ret;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f928f2a87b9..d4dc69ddebd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -970,6 +970,51 @@ undo:
return ret;
}
+#ifdef CONFIG_SMP
+struct work_for_cpu {
+ struct work_struct work;
+ long (*fn)(void *);
+ void *arg;
+ long ret;
+};
+
+static void do_work_for_cpu(struct work_struct *w)
+{
+ struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work);
+
+ wfc->ret = wfc->fn(wfc->arg);
+}
+
+/**
+ * work_on_cpu - run a function in user context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn: the function to run
+ * @arg: the function arg
+ *
+ * This will return -EINVAL in the cpu is not online, or the return value
+ * of @fn otherwise.
+ */
+long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+{
+ struct work_for_cpu wfc;
+
+ INIT_WORK(&wfc.work, do_work_for_cpu);
+ wfc.fn = fn;
+ wfc.arg = arg;
+ get_online_cpus();
+ if (unlikely(!cpu_online(cpu)))
+ wfc.ret = -EINVAL;
+ else {
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
+ }
+ put_online_cpus();
+
+ return wfc.ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu);
+#endif /* CONFIG_SMP */
+
void __init init_workqueues(void)
{
cpu_populated_map = cpu_online_map;