summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile10
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/audit.c10
-rw-r--r--kernel/audit.h1
-rw-r--r--kernel/auditfilter.c63
-rw-r--r--kernel/auditsc.c168
-rw-r--r--kernel/capability.c8
-rw-r--r--kernel/cpu.c75
-rw-r--r--kernel/cpuset.c59
-rw-r--r--kernel/delayacct.c162
-rw-r--r--kernel/exit.c51
-rw-r--r--kernel/fork.c71
-rw-r--r--kernel/futex.c258
-rw-r--r--kernel/futex_compat.c34
-rw-r--r--kernel/hrtimer.c12
-rw-r--r--kernel/irq/chip.c5
-rw-r--r--kernel/irq/handle.c23
-rw-r--r--kernel/irq/manage.c34
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/kallsyms.c4
-rw-r--r--kernel/kmod.c7
-rw-r--r--kernel/kprobes.c1
-rw-r--r--kernel/kthread.c24
-rw-r--r--kernel/lockdep.c2704
-rw-r--r--kernel/lockdep_internals.h78
-rw-r--r--kernel/lockdep_proc.c345
-rw-r--r--kernel/module.c37
-rw-r--r--kernel/mutex-debug.c399
-rw-r--r--kernel/mutex-debug.h94
-rw-r--r--kernel/mutex.c74
-rw-r--r--kernel/mutex.h19
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/pid.c6
-rw-r--r--kernel/power/Kconfig6
-rw-r--r--kernel/power/pm.c37
-rw-r--r--kernel/power/process.c26
-rw-r--r--kernel/power/snapshot.c10
-rw-r--r--kernel/power/swap.c26
-rw-r--r--kernel/printk.c31
-rw-r--r--kernel/ptrace.c6
-rw-r--r--kernel/rcupdate.c14
-rw-r--r--kernel/resource.c11
-rw-r--r--kernel/rtmutex-debug.c307
-rw-r--r--kernel/rtmutex-debug.h8
-rw-r--r--kernel/rtmutex-tester.c5
-rw-r--r--kernel/rtmutex.c59
-rw-r--r--kernel/rtmutex.h3
-rw-r--r--kernel/rwsem.c147
-rw-r--r--kernel/sched.c869
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/softirq.c165
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/spinlock.c79
-rw-r--r--kernel/stacktrace.c24
-rw-r--r--kernel/stop_machine.c18
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/taskstats.c564
-rw-r--r--kernel/timer.c167
-rw-r--r--kernel/wait.c8
-rw-r--r--kernel/workqueue.c150
61 files changed, 5841 insertions, 1787 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 82fb182f6f6..d62ec66c1af 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,10 +8,15 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o
+ hrtimer.o rwsem.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
+obj-$(CONFIG_LOCKDEP) += lockdep.o
+ifeq ($(CONFIG_PROC_FS),y)
+obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
+endif
obj-$(CONFIG_FUTEX) += futex.o
ifeq ($(CONFIG_COMPAT),y)
obj-$(CONFIG_FUTEX) += futex_compat.o
@@ -22,6 +27,7 @@ obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -42,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index f18e0b8df3e..2a7c933651c 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -488,7 +488,7 @@ static void do_acct_process(struct file *file)
old_encode_dev(tty_devnum(current->signal->tty)) : 0;
read_unlock(&tasklist_lock);
- spin_lock(&current->sighand->siglock);
+ spin_lock_irq(&current->sighand->siglock);
ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
ac.ac_flag = pacct->ac_flag;
@@ -496,7 +496,7 @@ static void do_acct_process(struct file *file)
ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
ac.ac_exitcode = pacct->ac_exitcode;
- spin_unlock(&current->sighand->siglock);
+ spin_unlock_irq(&current->sighand->siglock);
ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
ac.ac_swaps = encode_comp_t(0);
diff --git a/kernel/audit.c b/kernel/audit.c
index d417ca1db79..963fd15c962 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = {
/* Initialize audit support at boot time. */
static int __init audit_init(void)
{
-#ifdef CONFIG_AUDITSYSCALL
int i;
-#endif
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
@@ -717,10 +715,10 @@ static int __init audit_init(void)
audit_ih = inotify_init(&audit_inotify_ops);
if (IS_ERR(audit_ih))
audit_panic("cannot initialize inotify handle");
+#endif
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
-#endif
return 0;
}
@@ -1030,6 +1028,9 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
struct sk_buff *skb;
static const unsigned char *hex = "0123456789ABCDEF";
+ if (!ab)
+ return;
+
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
@@ -1062,6 +1063,9 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
unsigned char *ptr;
struct sk_buff *skb;
+ if (!ab)
+ return;
+
BUG_ON(!ab->skb);
skb = ab->skb;
avail = skb_tailroom(skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 6aa33b848cf..a3370232a39 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -104,6 +104,7 @@ static inline int audit_hash_ino(u32 ino)
return (ino & (AUDIT_INODE_BUCKETS-1));
}
+extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_compare_dname_path(const char *dname, const char *path,
int *dirlen);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 5b4e16276ca..a44879b0c72 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -302,6 +302,15 @@ int __init audit_register_class(int class, unsigned *list)
return 0;
}
+int audit_match_class(int class, unsigned syscall)
+{
+ if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32)))
+ return 0;
+ if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
+ return 0;
+ return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
+}
+
/* Common user-space to kernel rule translation. */
static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
{
@@ -404,6 +413,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_PERS:
case AUDIT_ARCH:
case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
case AUDIT_DEVMAJOR:
case AUDIT_DEVMINOR:
case AUDIT_EXIT:
@@ -413,6 +423,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_ARG2:
case AUDIT_ARG3:
break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ goto exit_free;
+ break;
case AUDIT_INODE:
err = audit_to_inode(&entry->rule, f);
if (err)
@@ -442,6 +456,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
case AUDIT_EQUAL:
break;
default:
+ err = -EINVAL;
goto exit_free;
}
}
@@ -566,6 +581,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ goto exit_free;
+ break;
default:
goto exit_free;
}
@@ -579,6 +598,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_EQUAL:
break;
default:
+ err = -EINVAL;
goto exit_free;
}
}
@@ -911,7 +931,7 @@ static void audit_update_watch(struct audit_parent *parent,
}
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "audit updated rules specifying watch=");
+ audit_log_format(ab, "audit updated rules specifying path=");
audit_log_untrustedstring(ab, owatch->path);
audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
audit_log_end(ab);
@@ -934,19 +954,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
struct audit_watch *w, *nextw;
struct audit_krule *r, *nextr;
struct audit_entry *e;
+ struct audit_buffer *ab;
mutex_lock(&audit_filter_mutex);
parent->flags |= AUDIT_PARENT_INVALID;
list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, "audit implicitly removed rule path=");
+ audit_log_untrustedstring(ab, w->path);
+ if (r->filterkey) {
+ audit_log_format(ab, " key=");
+ audit_log_untrustedstring(ab, r->filterkey);
+ } else
+ audit_log_format(ab, " key=(null)");
+ audit_log_format(ab, " list=%d", r->listnr);
+ audit_log_end(ab);
+
list_del(&r->rlist);
list_del_rcu(&e->list);
call_rcu(&e->rcu, audit_free_rule_rcu);
-
- audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
- "audit implicitly removed rule from list=%d\n",
- AUDIT_FILTER_EXIT);
}
audit_remove_watch(w);
}
@@ -1134,6 +1163,14 @@ static inline int audit_add_rule(struct audit_entry *entry,
struct audit_watch *watch = entry->rule.watch;
struct nameidata *ndp, *ndw;
int h, err, putnd_needed = 0;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
if (inode_f) {
h = audit_hash_ino(inode_f->val);
@@ -1174,6 +1211,10 @@ static inline int audit_add_rule(struct audit_entry *entry,
} else {
list_add_tail_rcu(&entry->list, list);
}
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules++;
+#endif
mutex_unlock(&audit_filter_mutex);
if (putnd_needed)
@@ -1198,6 +1239,14 @@ static inline int audit_del_rule(struct audit_entry *entry,
struct audit_watch *watch, *tmp_watch = entry->rule.watch;
LIST_HEAD(inotify_list);
int h, ret = 0;
+#ifdef CONFIG_AUDITSYSCALL
+ int dont_count = 0;
+
+ /* If either of these, don't count towards total */
+ if (entry->rule.listnr == AUDIT_FILTER_USER ||
+ entry->rule.listnr == AUDIT_FILTER_TYPE)
+ dont_count = 1;
+#endif
if (inode_f) {
h = audit_hash_ino(inode_f->val);
@@ -1235,6 +1284,10 @@ static inline int audit_del_rule(struct audit_entry *entry,
list_del_rcu(&e->list);
call_rcu(&e->rcu, audit_free_rule_rcu);
+#ifdef CONFIG_AUDITSYSCALL
+ if (!dont_count)
+ audit_n_rules--;
+#endif
mutex_unlock(&audit_filter_mutex);
if (!list_empty(&inotify_list))
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ae40ac8c39e..1bd8827a010 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -85,6 +85,9 @@ extern int audit_enabled;
/* Indicates that audit should log the full pathname. */
#define AUDIT_NAME_FULL -1
+/* number of audit rules */
+int audit_n_rules;
+
/* When fs/namei.c:getname() is called, we store the pointer in name and
* we don't let putname() free it (instead we free all of the saved
* pointers at syscall exit time).
@@ -174,6 +177,7 @@ struct audit_aux_data_path {
/* The per-task audit context. */
struct audit_context {
+ int dummy; /* must be the first element */
int in_syscall; /* 1 if task is in a syscall */
enum audit_state state;
unsigned int serial; /* serial number for record */
@@ -205,6 +209,54 @@ struct audit_context {
#endif
};
+#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
+static inline int open_arg(int flags, int mask)
+{
+ int n = ACC_MODE(flags);
+ if (flags & (O_TRUNC | O_CREAT))
+ n |= AUDIT_PERM_WRITE;
+ return n & mask;
+}
+
+static int audit_match_perm(struct audit_context *ctx, int mask)
+{
+ unsigned n = ctx->major;
+ switch (audit_classify_syscall(ctx->arch, n)) {
+ case 0: /* native */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR, n))
+ return 1;
+ return 0;
+ case 1: /* 32bit on biarch */
+ if ((mask & AUDIT_PERM_WRITE) &&
+ audit_match_class(AUDIT_CLASS_WRITE_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_READ) &&
+ audit_match_class(AUDIT_CLASS_READ_32, n))
+ return 1;
+ if ((mask & AUDIT_PERM_ATTR) &&
+ audit_match_class(AUDIT_CLASS_CHATTR_32, n))
+ return 1;
+ return 0;
+ case 2: /* open */
+ return mask & ACC_MODE(ctx->argv[1]);
+ case 3: /* openat */
+ return mask & ACC_MODE(ctx->argv[2]);
+ case 4: /* socketcall */
+ return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
+ case 5: /* execve */
+ return mask & AUDIT_PERM_EXEC;
+ default:
+ return 0;
+ }
+}
+
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise. */
@@ -393,6 +445,9 @@ static int audit_filter_rules(struct task_struct *tsk,
/* ignore this field for filtering */
result = 1;
break;
+ case AUDIT_PERM:
+ result = audit_match_perm(ctx, f->val);
+ break;
}
if (!result)
@@ -514,7 +569,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
context->return_valid = return_valid;
context->return_code = return_code;
- if (context->in_syscall && !context->auditable) {
+ if (context->in_syscall && !context->dummy && !context->auditable) {
enum audit_state state;
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
@@ -530,17 +585,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
}
get_context:
- context->pid = tsk->pid;
- context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
- context->uid = tsk->uid;
- context->gid = tsk->gid;
- context->euid = tsk->euid;
- context->suid = tsk->suid;
- context->fsuid = tsk->fsuid;
- context->egid = tsk->egid;
- context->sgid = tsk->sgid;
- context->fsgid = tsk->fsgid;
- context->personality = tsk->personality;
+
tsk->audit_context = NULL;
return context;
}
@@ -749,6 +794,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
const char *tty;
/* tsk == current */
+ context->pid = tsk->pid;
+ context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
+ context->uid = tsk->uid;
+ context->gid = tsk->gid;
+ context->euid = tsk->euid;
+ context->suid = tsk->suid;
+ context->fsuid = tsk->fsuid;
+ context->egid = tsk->egid;
+ context->sgid = tsk->sgid;
+ context->fsgid = tsk->fsgid;
+ context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
@@ -1066,7 +1122,8 @@ void audit_syscall_entry(int arch, int major,
context->argv[3] = a4;
state = context->state;
- if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+ context->dummy = !audit_n_rules;
+ if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT))
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
if (likely(state == AUDIT_DISABLED))
return;
@@ -1199,14 +1256,18 @@ void audit_putname(const char *name)
#endif
}
-static void audit_inode_context(int idx, const struct inode *inode)
+/* Copy inode data into an audit_names. */
+static void audit_copy_inode(struct audit_names *name, const struct inode *inode)
{
- struct audit_context *context = current->audit_context;
-
- selinux_get_inode_sid(inode, &context->names[idx].osid);
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ selinux_get_inode_sid(inode, &name->osid);
}
-
/**
* audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -1240,20 +1301,14 @@ void __audit_inode(const char *name, const struct inode *inode)
++context->ino_count;
#endif
}
- context->names[idx].ino = inode->i_ino;
- context->names[idx].dev = inode->i_sb->s_dev;
- context->names[idx].mode = inode->i_mode;
- context->names[idx].uid = inode->i_uid;
- context->names[idx].gid = inode->i_gid;
- context->names[idx].rdev = inode->i_rdev;
- audit_inode_context(idx, inode);
+ audit_copy_inode(&context->names[idx], inode);
}
/**
* audit_inode_child - collect inode info for created/removed objects
* @dname: inode's dentry name
* @inode: inode being audited
- * @pino: inode number of dentry parent
+ * @parent: inode of dentry parent
*
* For syscalls that create or remove filesystem objects, audit_inode
* can only collect information for the filesystem object's parent.
@@ -1264,7 +1319,7 @@ void __audit_inode(const char *name, const struct inode *inode)
* unsuccessful attempts.
*/
void __audit_inode_child(const char *dname, const struct inode *inode,
- unsigned long pino)
+ const struct inode *parent)
{
int idx;
struct audit_context *context = current->audit_context;
@@ -1278,7 +1333,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
if (!dname)
goto update_context;
for (idx = 0; idx < context->name_count; idx++)
- if (context->names[idx].ino == pino) {
+ if (context->names[idx].ino == parent->i_ino) {
const char *name = context->names[idx].name;
if (!name)
@@ -1302,16 +1357,47 @@ update_context:
context->names[idx].name_len = AUDIT_NAME_FULL;
context->names[idx].name_put = 0; /* don't call __putname() */
- if (inode) {
- context->names[idx].ino = inode->i_ino;
- context->names[idx].dev = inode->i_sb->s_dev;
- context->names[idx].mode = inode->i_mode;
- context->names[idx].uid = inode->i_uid;
- context->names[idx].gid = inode->i_gid;
- context->names[idx].rdev = inode->i_rdev;
- audit_inode_context(idx, inode);
- } else
- context->names[idx].ino = (unsigned long)-1;
+ if (!inode)
+ context->names[idx].ino = (unsigned long)-1;
+ else
+ audit_copy_inode(&context->names[idx], inode);
+
+ /* A parent was not found in audit_names, so copy the inode data for the
+ * provided parent. */
+ if (!found_name) {
+ idx = context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ audit_copy_inode(&context->names[idx], parent);
+ }
+}
+
+/**
+ * audit_inode_update - update inode info for last collected name
+ * @inode: inode being audited
+ *
+ * When open() is called on an existing object with the O_CREAT flag, the inode
+ * data audit initially collects is incorrect. This additional hook ensures
+ * audit has the inode data for the actual object to be opened.
+ */
+void __audit_inode_update(const struct inode *inode)
+{
+ struct audit_context *context = current->audit_context;
+ int idx;
+
+ if (!context->in_syscall || !inode)
+ return;
+
+ if (context->name_count == 0) {
+ context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ }
+ idx = context->name_count - 1;
+
+ audit_copy_inode(&context->names[idx], inode);
}
/**
@@ -1642,7 +1728,7 @@ int audit_bprm(struct linux_binprm *bprm)
unsigned long p, next;
void *to;
- if (likely(!audit_enabled || !context))
+ if (likely(!audit_enabled || !context || context->dummy))
return 0;
ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
@@ -1680,7 +1766,7 @@ int audit_socketcall(int nargs, unsigned long *args)
struct audit_aux_data_socketcall *ax;
struct audit_context *context = current->audit_context;
- if (likely(!context))
+ if (likely(!context || context->dummy))
return 0;
ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
@@ -1708,7 +1794,7 @@ int audit_sockaddr(int len, void *a)
struct audit_aux_data_sockaddr *ax;
struct audit_context *context = current->audit_context;
- if (likely(!context))
+ if (likely(!context || context->dummy))
return 0;
ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
diff --git a/kernel/capability.c b/kernel/capability.c
index 1a4d8a40d3f..c7685ad00a9 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
int ret = 0;
pid_t pid;
__u32 version;
- task_t *target;
+ struct task_struct *target;
struct __user_cap_data_struct data;
if (get_user(version, &header->version))
@@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
- task_t *g, *target;
+ struct task_struct *g, *target;
int ret = -EPERM;
int found = 0;
@@ -128,7 +128,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
kernel_cap_t *inheritable,
kernel_cap_t *permitted)
{
- task_t *g, *target;
+ struct task_struct *g, *target;
int ret = -EPERM;
int found = 0;
@@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
{
kernel_cap_t inheritable, permitted, effective;
__u32 version;
- task_t *target;
+ struct task_struct *target;
int ret;
pid_t pid;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 70fbf2e8376..f230f9ae01c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,56 +16,48 @@
#include <linux/mutex.h>
/* This protects CPUs going up and down... */
-static DEFINE_MUTEX(cpucontrol);
+static DEFINE_MUTEX(cpu_add_remove_lock);
+static DEFINE_MUTEX(cpu_bitmask_lock);
static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
#ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct *lock_cpu_hotplug_owner;
-static int lock_cpu_hotplug_depth;
-static int __lock_cpu_hotplug(int interruptible)
-{
- int ret = 0;
-
- if (lock_cpu_hotplug_owner != current) {
- if (interruptible)
- ret = mutex_lock_interruptible(&cpucontrol);
- else
- mutex_lock(&cpucontrol);
- }
-
- /*
- * Set only if we succeed in locking
- */
- if (!ret) {
- lock_cpu_hotplug_depth++;
- lock_cpu_hotplug_owner = current;
- }
-
- return ret;
-}
+/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
+static struct task_struct *recursive;
+static int recursive_depth;
void lock_cpu_hotplug(void)
{
- __lock_cpu_hotplug(0);
+ struct task_struct *tsk = current;
+
+ if (tsk == recursive) {
+ static int warnings = 10;
+ if (warnings) {
+ printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
+ WARN_ON(1);
+ warnings--;
+ }
+ recursive_depth++;
+ return;
+ }
+ mutex_lock(&cpu_bitmask_lock);
+ recursive = tsk;
}
EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
void unlock_cpu_hotplug(void)
{
- if (--lock_cpu_hotplug_depth == 0) {
- lock_cpu_hotplug_owner = NULL;
- mutex_unlock(&cpucontrol);
+ WARN_ON(recursive != current);
+ if (recursive_depth) {
+ recursive_depth--;
+ return;
}
+ mutex_unlock(&cpu_bitmask_lock);
+ recursive = NULL;
}
EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
-int lock_cpu_hotplug_interruptible(void)
-{
- return __lock_cpu_hotplug(1);
-}
-EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
@@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu)
struct task_struct *p;
cpumask_t old_allowed, tmp;
- if ((err = lock_cpu_hotplug_interruptible()) != 0)
- return err;
-
+ mutex_lock(&cpu_add_remove_lock);
if (num_online_cpus() == 1) {
err = -EBUSY;
goto out;
@@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu)
cpu_clear(cpu, tmp);
set_cpus_allowed(current, tmp);
+ mutex_lock(&cpu_bitmask_lock);
p = __stop_machine_run(take_cpu_down, NULL, cpu);
+ mutex_unlock(&cpu_bitmask_lock);
+
if (IS_ERR(p)) {
/* CPU didn't die: tell everyone. Can't complain. */
if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
@@ -187,7 +180,7 @@ out_thread:
out_allowed:
set_cpus_allowed(current, old_allowed);
out:
- unlock_cpu_hotplug();
+ mutex_unlock(&cpu_add_remove_lock);
return err;
}
#endif /*CONFIG_HOTPLUG_CPU*/
@@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu)
int ret;
void *hcpu = (void *)(long)cpu;
- if ((ret = lock_cpu_hotplug_interruptible()) != 0)
- return ret;
-
+ mutex_lock(&cpu_add_remove_lock);
if (cpu_online(cpu) || !cpu_present(cpu)) {
ret = -EINVAL;
goto out;
@@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu)
}
/* Arch-specific enabling code. */
+ mutex_lock(&cpu_bitmask_lock);
ret = __cpu_up(cpu);
+ mutex_unlock(&cpu_bitmask_lock);
if (ret != 0)
goto out_notify;
BUG_ON(!cpu_online(cpu));
@@ -227,6 +220,6 @@ out_notify:
blocking_notifier_call_chain(&cpu_chain,
CPU_UP_CANCELED, hcpu);
out:
- unlock_cpu_hotplug();
+ mutex_unlock(&cpu_add_remove_lock);
return ret;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c232dc07743..4ea6f0dc2fc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
*
* Call with manage_mutex held. May nest a call to the
* lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
*/
static void update_cpu_domains(struct cpuset *cur)
@@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur)
if (is_cpu_exclusive(c))
cpus_andnot(pspan, pspan, c->cpus_allowed);
}
- if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+ if (!is_cpu_exclusive(cur)) {
cpus_or(pspan, pspan, cur->cpus_allowed);
if (cpus_equal(pspan, cur->cpus_allowed))
return;
@@ -814,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
struct cpuset trialcs;
int retval, cpus_unchanged;
+ /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
trialcs = *cs;
retval = cpulist_parse(buf, trialcs.cpus_allowed);
if (retval < 0)
@@ -1917,6 +1923,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex. Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cpuset *cs = dentry->d_fsdata;
@@ -1936,11 +1953,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_unlock(&manage_mutex);
return -EBUSY;
}
+ if (is_cpu_exclusive(cs)) {
+ int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+ if (retval < 0) {
+ mutex_unlock(&manage_mutex);
+ return retval;
+ }
+ }
parent = cs->parent;
mutex_lock(&callback_mutex);
set_bit(CS_REMOVED, &cs->flags);
- if (is_cpu_exclusive(cs))
- update_cpu_domains(cs);
list_del(&cs->sibling); /* delete my sibling from parent->children */
spin_lock(&cs->dentry->d_lock);
d = dget(cs->dentry);
@@ -2015,6 +2037,33 @@ out:
return err;
}
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This handles CPU hotplug (cpuhp) events. If someday Memory
+ * Nodes can be hotplugged (dynamically changing node_online_map)
+ * then we should handle that too, perhaps in a similar way.
+ */
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+ unsigned long phase, void *cpu)
+{
+ mutex_lock(&manage_mutex);
+ mutex_lock(&callback_mutex);
+
+ top_cpuset.cpus_allowed = cpu_online_map;
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&manage_mutex);
+
+ return 0;
+}
+#endif
+
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -2025,6 +2074,8 @@ void __init cpuset_init_smp(void)
{
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_online_map;
+
+ hotcpu_notifier(cpuset_handle_cpuhp, 0);
}
/**
@@ -2369,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
int cpuset_excl_nodes_overlap(const struct task_struct *p)
{
const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
- int overlap = 0; /* do cpusets overlap? */
+ int overlap = 1; /* do cpusets overlap? */
task_lock(current);
if (current->flags & PF_EXITING) {
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 00000000000..36752f124c6
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,162 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+
+int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
+kmem_cache_t *delayacct_cache;
+
+static int __init delayacct_setup_disable(char *str)
+{
+ delayacct_on = 0;
+ return 1;
+}
+__setup("nodelayacct", delayacct_setup_disable);
+
+void delayacct_init(void)
+{
+ delayacct_cache = kmem_cache_create("delayacct_cache",
+ sizeof(struct task_delay_info),
+ 0,
+ SLAB_PANIC,
+ NULL, NULL);
+ delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+ if (tsk->delays)
+ spin_lock_init(&tsk->delays->lock);
+}
+
+/*
+ * Start accounting for a delay statistic using
+ * its starting timestamp (@start)
+ */
+
+static inline void delayacct_start(struct timespec *start)
+{
+ do_posix_clock_monotonic_gettime(start);
+}
+
+/*
+ * Finish delay accounting for a statistic using
+ * its timestamps (@start, @end), accumalator (@total) and @count
+ */
+
+static void delayacct_end(struct timespec *start, struct timespec *end,
+ u64 *total, u32 *count)
+{
+ struct timespec ts;
+ s64 ns;
+
+ do_posix_clock_monotonic_gettime(end);
+ ts = timespec_sub(*end, *start);
+ ns = timespec_to_ns(&ts);
+ if (ns < 0)
+ return;
+
+ spin_lock(&current->delays->lock);
+ *total += ns;
+ (*count)++;
+ spin_unlock(&current->delays->lock);
+}
+
+void __delayacct_blkio_start(void)
+{
+ delayacct_start(&current->delays->blkio_start);
+}
+
+void __delayacct_blkio_end(void)
+{
+ if (current->delays->flags & DELAYACCT_PF_SWAPIN)
+ /* Swapin block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->blkio_end,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count);
+ else /* Other block I/O */
+ delayacct_end(&current->delays->blkio_start,
+ &current->delays->blkio_end,
+ &current->delays->blkio_delay,
+ &current->delays->blkio_count);
+}
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+ s64 tmp;
+ struct timespec ts;
+ unsigned long t1,t2,t3;
+
+ /* Though tsk->delays accessed later, early exit avoids
+ * unnecessary returning of other data
+ */
+ if (!tsk->delays)
+ goto done;
+
+ tmp = (s64)d->cpu_run_real_total;
+ cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+ tmp += timespec_to_ns(&ts);
+ d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+ /*
+ * No locking available for sched_info (and too expensive to add one)
+ * Mitigate by taking snapshot of values
+ */
+ t1 = tsk->sched_info.pcnt;
+ t2 = tsk->sched_info.run_delay;
+ t3 = tsk->sched_info.cpu_time;
+
+ d->cpu_count += t1;
+
+ jiffies_to_timespec(t2, &ts);
+ tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
+ d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+ tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
+ d->cpu_run_virtual_total =
+ (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+
+ /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+ spin_lock(&tsk->delays->lock);
+ tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+ d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+ tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+ d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+ d->blkio_count += tsk->delays->blkio_count;
+ d->swapin_count += tsk->delays->swapin_count;
+ spin_unlock(&tsk->delays->lock);
+
+done:
+ return 0;
+}
+
+__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
+{
+ __u64 ret;
+
+ spin_lock(&tsk->delays->lock);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay +
+ tsk->delays->swapin_delay);
+ spin_unlock(&tsk->delays->lock);
+ return ret;
+}
+
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f7ef225855..d891883420f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,8 @@
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
#include <linux/cpuset.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
@@ -134,8 +136,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
void release_task(struct task_struct * p)
{
+ struct task_struct *leader;
int zap_leader;
- task_t *leader;
repeat:
atomic_dec(&p->user->processes);
write_lock_irq(&tasklist_lock);
@@ -209,7 +211,7 @@ out:
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
-static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
+static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
{
struct task_struct *p;
int ret = 1;
@@ -582,7 +584,8 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}
-static inline void choose_new_parent(task_t *p, task_t *reaper)
+static inline void
+choose_new_parent(struct task_struct *p, struct task_struct *reaper)
{
/*
* Make sure we're not reparenting to ourselves and that
@@ -592,7 +595,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper)
p->real_parent = reaper;
}
-static void reparent_thread(task_t *p, task_t *father, int traced)
+static void
+reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
{
/* We don't want people slaying init. */
if (p->exit_signal != -1)
@@ -656,8 +660,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
* group, and if no such member exists, give it to
* the global child reaper process (ie "init")
*/
-static void forget_original_parent(struct task_struct * father,
- struct list_head *to_release)
+static void
+forget_original_parent(struct task_struct *father, struct list_head *to_release)
{
struct task_struct *p, *reaper = father;
struct list_head *_p, *_n;
@@ -680,7 +684,7 @@ static void forget_original_parent(struct task_struct * father,
*/
list_for_each_safe(_p, _n, &father->children) {
int ptrace;
- p = list_entry(_p,struct task_struct,sibling);
+ p = list_entry(_p, struct task_struct, sibling);
ptrace = p->ptrace;
@@ -709,7 +713,7 @@ static void forget_original_parent(struct task_struct * father,
list_add(&p->ptrace_list, to_release);
}
list_for_each_safe(_p, _n, &father->ptrace_children) {
- p = list_entry(_p,struct task_struct,ptrace_list);
+ p = list_entry(_p, struct task_struct, ptrace_list);
choose_new_parent(p, reaper);
reparent_thread(p, father, 1);
}
@@ -829,7 +833,7 @@ static void exit_notify(struct task_struct *tsk)
list_for_each_safe(_p, _n, &ptrace_dead) {
list_del_init(_p);
- t = list_entry(_p,struct task_struct,ptrace_list);
+ t = list_entry(_p, struct task_struct, ptrace_list);
release_task(t);
}
@@ -841,7 +845,9 @@ static void exit_notify(struct task_struct *tsk)
fastcall NORET_TYPE void do_exit(long code)
{
struct task_struct *tsk = current;
+ struct taskstats *tidstats;
int group_dead;
+ unsigned int mycpu;
profile_task_exit(tsk);
@@ -879,6 +885,8 @@ fastcall NORET_TYPE void do_exit(long code)
current->comm, current->pid,
preempt_count());
+ taskstats_exit_alloc(&tidstats, &mycpu);
+
acct_update_integrals(tsk);
if (tsk->mm) {
update_hiwater_rss(tsk->mm);
@@ -898,6 +906,9 @@ fastcall NORET_TYPE void do_exit(long code)
#endif
if (unlikely(tsk->audit_context))
audit_free(tsk);
+ taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
+ taskstats_exit_free(tidstats);
+
exit_mm(tsk);
if (group_dead)
@@ -933,10 +944,9 @@ fastcall NORET_TYPE void do_exit(long code)
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
/*
- * If DEBUG_MUTEXES is on, make sure we are holding no locks:
+ * Make sure we are holding no locks:
*/
- mutex_debug_check_no_locks_held(tsk);
- rt_mutex_debug_check_no_locks_held(tsk);
+ debug_check_no_locks_held(tsk);
if (tsk->io_context)
exit_io_context();
@@ -1011,7 +1021,7 @@ asmlinkage void sys_exit_group(int error_code)
do_group_exit((error_code & 0xff) << 8);
}
-static int eligible_child(pid_t pid, int options, task_t *p)
+static int eligible_child(pid_t pid, int options, struct task_struct *p)
{
if (pid > 0) {
if (p->pid != pid)
@@ -1043,7 +1053,7 @@ static int eligible_child(pid_t pid, int options, task_t *p)
* Do not consider thread group leaders that are
* in a non-empty thread group:
*/
- if (current->tgid != p->tgid && delay_group_leader(p))
+ if (delay_group_leader(p))
return 2;
if (security_task_wait(p))
@@ -1052,12 +1062,13 @@ static int eligible_child(pid_t pid, int options, task_t *p)
return 1;
}
-static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
+static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
int why, int status,
struct siginfo __user *infop,
struct rusage __user *rusagep)
{
int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
+
put_task_struct(p);
if (!retval)
retval = put_user(SIGCHLD, &infop->si_signo);
@@ -1082,7 +1093,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_zombie(task_t *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int noreap,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1244,8 +1255,8 @@ static int wait_task_zombie(task_t *p, int noreap,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
- struct siginfo __user *infop,
+static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
+ int noreap, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
int retval, exit_code;
@@ -1359,7 +1370,7 @@ bail_ref:
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_continued(task_t *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int noreap,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1445,7 +1456,7 @@ repeat:
int ret;
list_for_each(_p,&tsk->children) {
- p = list_entry(_p,struct task_struct,sibling);
+ p = list_entry(_p, struct task_struct, sibling);
ret = eligible_child(pid, options, p);
if (!ret)
diff --git a/kernel/fork.c b/kernel/fork.c
index 9064bf9e131..f9b014e3e70 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -43,6 +43,8 @@
#include <linux/rmap.h>
#include <linux/acct.h>
#include <linux/cn_proc.h>
+#include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -61,9 +63,7 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
- __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
-
-EXPORT_SYMBOL(tasklist_lock);
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
int nr_processes(void)
{
@@ -117,6 +117,7 @@ void __put_task_struct(struct task_struct *tsk)
security_task_free(tsk);
free_uid(tsk->user);
put_group_info(tsk->group_info);
+ delayacct_tsk_free(tsk);
if (!profile_handoff_task(tsk))
free_task(tsk);
@@ -193,7 +194,10 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
down_write(&oldmm->mmap_sem);
flush_cache_mm(oldmm);
- down_write(&mm->mmap_sem);
+ /*
+ * Not linked in yet - no deadlock potential:
+ */
+ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
mm->locked_vm = 0;
mm->mmap = NULL;
@@ -817,6 +821,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
if (clone_flags & CLONE_THREAD) {
atomic_inc(&current->signal->count);
atomic_inc(&current->signal->live);
+ taskstats_tgid_alloc(current->signal);
return 0;
}
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -861,6 +866,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
INIT_LIST_HEAD(&sig->cpu_timers[0]);
INIT_LIST_HEAD(&sig->cpu_timers[1]);
INIT_LIST_HEAD(&sig->cpu_timers[2]);
+ taskstats_tgid_init(sig);
task_lock(current->group_leader);
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -882,6 +888,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
void __cleanup_signal(struct signal_struct *sig)
{
exit_thread_group_keys(sig);
+ taskstats_tgid_free(sig);
kmem_cache_free(signal_cachep, sig);
}
@@ -919,10 +926,6 @@ static inline void rt_mutex_init_task(struct task_struct *p)
spin_lock_init(&p->pi_lock);
plist_head_init(&p->pi_waiters, &p->pi_lock);
p->pi_blocked_on = NULL;
-# ifdef CONFIG_DEBUG_RT_MUTEXES
- spin_lock_init(&p->held_list_lock);
- INIT_LIST_HEAD(&p->held_list_head);
-# endif
#endif
}
@@ -934,13 +937,13 @@ static inline void rt_mutex_init_task(struct task_struct *p)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static task_t *copy_process(unsigned long clone_flags,
- unsigned long stack_start,
- struct pt_regs *regs,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
- int pid)
+static struct task_struct *copy_process(unsigned long clone_flags,
+ unsigned long stack_start,
+ struct pt_regs *regs,
+ unsigned long stack_size,
+ int __user *parent_tidptr,
+ int __user *child_tidptr,
+ int pid)
{
int retval;
struct task_struct *p = NULL;
@@ -972,6 +975,10 @@ static task_t *copy_process(unsigned long clone_flags,
if (!p)
goto fork_out;
+#ifdef CONFIG_TRACE_IRQFLAGS
+ DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
+ DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
+#endif
retval = -EAGAIN;
if (atomic_read(&p->user->processes) >=
p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
@@ -999,12 +1006,13 @@ static task_t *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
+ delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
p->pid = pid;
retval = -EFAULT;
if (clone_flags & CLONE_PARENT_SETTID)
if (put_user(p->pid, parent_tidptr))
- goto bad_fork_cleanup;
+ goto bad_fork_cleanup_delays_binfmt;
INIT_LIST_HEAD(&p->children);
INIT_LIST_HEAD(&p->sibling);
@@ -1046,6 +1054,26 @@ static task_t *copy_process(unsigned long clone_flags,
}
mpol_fix_fork_child_flag(p);
#endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+ p->irq_events = 0;
+ p->hardirqs_enabled = 0;
+ p->hardirq_enable_ip = 0;
+ p->hardirq_enable_event = 0;
+ p->hardirq_disable_ip = _THIS_IP_;
+ p->hardirq_disable_event = 0;
+ p->softirqs_enabled = 1;
+ p->softirq_enable_ip = _THIS_IP_;
+ p->softirq_enable_event = 0;
+ p->softirq_disable_ip = 0;
+ p->softirq_disable_event = 0;
+ p->hardirq_context = 0;
+ p->softirq_context = 0;
+#endif
+#ifdef CONFIG_LOCKDEP
+ p->lockdep_depth = 0; /* no locks held yet */
+ p->curr_chain_key = 0;
+ p->lockdep_recursion = 0;
+#endif
rt_mutex_init_task(p);
@@ -1250,7 +1278,8 @@ bad_fork_cleanup_policy:
bad_fork_cleanup_cpuset:
#endif
cpuset_exit(p);
-bad_fork_cleanup:
+bad_fork_cleanup_delays_binfmt:
+ delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
@@ -1271,9 +1300,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
return regs;
}
-task_t * __devinit fork_idle(int cpu)
+struct task_struct * __devinit fork_idle(int cpu)
{
- task_t *task;
+ struct task_struct *task;
struct pt_regs regs;
task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
@@ -1360,8 +1389,10 @@ long do_fork(unsigned long clone_flags,
if (clone_flags & CLONE_VFORK) {
wait_for_completion(&vfork);
- if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+ if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
+ current->ptrace_message = nr;
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+ }
}
} else {
free_pid(pid);
diff --git a/kernel/futex.c b/kernel/futex.c
index 15caf93e4a4..9d260e838cf 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -297,7 +297,7 @@ static int futex_handle_fault(unsigned long address, int attempt)
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
- if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+ if (attempt > 2 || !(vma = find_vma(mm, address)) ||
vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
return -EFAULT;
@@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
p = NULL;
goto out_unlock;
}
- if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+ if (p->exit_state != 0) {
p = NULL;
goto out_unlock;
}
@@ -415,15 +415,15 @@ out_unlock:
*/
void exit_pi_state_list(struct task_struct *curr)
{
- struct futex_hash_bucket *hb;
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
union futex_key key;
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
- * versus waiters unqueueing themselfs
+ * versus waiters unqueueing themselves:
*/
spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
@@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr)
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
+ hb = hash_futex(&key);
spin_unlock_irq(&curr->pi_lock);
- hb = hash_futex(&key);
spin_lock(&hb->lock);
spin_lock_irq(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
if (head->next != next) {
spin_unlock(&hb->lock);
continue;
}
- list_del_init(&pi_state->list);
-
WARN_ON(pi_state->owner != curr);
-
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
pi_state->owner = NULL;
spin_unlock_irq(&curr->pi_lock);
@@ -470,12 +473,20 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
head = &hb->chain;
list_for_each_entry_safe(this, next, head, list) {
- if (match_futex (&this->key, &me->key)) {
+ if (match_futex(&this->key, &me->key)) {
/*
* Another waiter already exists - bump up
* the refcount and return its pi_state:
*/
pi_state = this->pi_state;
+ /*
+ * Userspace might have messed up non PI and PI futexes
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
atomic_inc(&pi_state->refcount);
me->pi_state = pi_state;
@@ -484,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
}
/*
- * We are the first waiter - try to look up the real owner and
- * attach the new pi_state to it:
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when the owner died bit is set
+ * and TID = 0:
*/
pid = uval & FUTEX_TID_MASK;
+ if (!pid && (uval & FUTEX_OWNER_DIED))
+ return -ESRCH;
p = futex_find_get_task(pid);
if (!p)
return -ESRCH;
@@ -504,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
pi_state->key = me->key;
spin_lock_irq(&p->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
pi_state->owner = p;
spin_unlock_irq(&p->pi_lock);
@@ -567,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
* kept enabled while there is PI state around. We must also
* preserve the owner died bit.)
*/
- newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ newval = FUTEX_WAITERS | new_owner->pid;
- inc_preempt_count();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
- dec_preempt_count();
+ inc_preempt_count();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ dec_preempt_count();
+ if (curval == -EFAULT)
+ return -EFAULT;
+ if (curval != uval)
+ return -EINVAL;
+ }
- if (curval == -EFAULT)
- return -EFAULT;
- if (curval != uval)
- return -EINVAL;
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->owner->pi_state_list);
+ spin_lock_irq(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
+ spin_unlock_irq(&new_owner->pi_lock);
+
rt_mutex_unlock(&pi_state->pi_mutex);
return 0;
@@ -607,6 +631,22 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
}
/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 <= hb2) {
+ spin_lock(&hb1->lock);
+ if (hb1 < hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+ } else { /* hb1 > hb2 */
+ spin_lock(&hb2->lock);
+ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
@@ -674,11 +714,7 @@ retryfull:
hb2 = hash_futex(&key2);
retry:
- if (hb1 < hb2)
- spin_lock(&hb1->lock);
- spin_lock(&hb2->lock);
- if (hb1 > hb2)
- spin_lock(&hb1->lock);
+ double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
@@ -711,8 +747,10 @@ retry:
*/
if (attempt++) {
if (futex_handle_fault((unsigned long)uaddr2,
- attempt))
+ attempt)) {
+ ret = -EFAULT;
goto out;
+ }
goto retry;
}
@@ -787,11 +825,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
- if (hb1 < hb2)
- spin_lock(&hb1->lock);
- spin_lock(&hb2->lock);
- if (hb1 > hb2)
- spin_lock(&hb1->lock);
+ double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
u32 curval;
@@ -916,6 +950,7 @@ static int unqueue_me(struct futex_q *q)
/* In the common case we don't take the spinlock, which is nice. */
retry:
lock_ptr = q->lock_ptr;
+ barrier();
if (lock_ptr != 0) {
spin_lock(lock_ptr);
/*
@@ -1085,9 +1120,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
- struct hrtimer_sleeper *to)
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+ long nsec, int trylock)
{
+ struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
@@ -1097,6 +1133,13 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
if (refill_pi_state_cache())
return -ENOMEM;
+ if (sec != MAX_SCHEDULE_TIMEOUT) {
+ to = &timeout;
+ hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+ hrtimer_init_sleeper(to, current);
+ to->timer.expires = ktime_set(sec, nsec);
+ }
+
q.pi_state = NULL;
retry:
down_read(&curr->mm->mmap_sem);
@@ -1222,6 +1265,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
/* Owner died? */
if (q.pi_state->owner != NULL) {
spin_lock_irq(&q.pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&q.pi_state->list));
list_del_init(&q.pi_state->list);
spin_unlock_irq(&q.pi_state->owner->pi_lock);
} else
@@ -1230,6 +1274,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
q.pi_state->owner = current;
spin_lock_irq(&current->pi_lock);
+ WARN_ON(!list_empty(&q.pi_state->list));
list_add(&q.pi_state->list, &current->pi_state_list);
spin_unlock_irq(&current->pi_lock);
@@ -1270,7 +1315,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
if (!detect && ret == -EDEADLK && 0)
force_sig(SIGKILL, current);
- return ret;
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
out_unlock_release_sem:
queue_unlock(&q, hb);
@@ -1287,9 +1332,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt))
+ if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+ ret = -EFAULT;
goto out_unlock_release_sem;
-
+ }
goto retry_locked;
}
@@ -1304,76 +1350,6 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
}
/*
- * Restart handler
- */
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
- struct hrtimer_sleeper timeout, *to = NULL;
- int ret;
-
- restart->fn = do_no_restart_syscall;
-
- if (restart->arg2 || restart->arg3) {
- to = &timeout;
- hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
- hrtimer_init_sleeper(to, current);
- to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
- (u64) restart->arg0;
- }
-
- pr_debug("lock_pi restart: %p, %d (%d)\n",
- (u32 __user *)restart->arg0, current->pid);
-
- ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
- 0, to);
-
- if (ret != -EINTR)
- return ret;
-
- restart->fn = futex_lock_pi_restart;
-
- /* The other values are filled in */
- return -ERESTART_RESTARTBLOCK;
-}
-
-/*
- * Called from the syscall entry below.
- */
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
- long nsec, int trylock)
-{
- struct hrtimer_sleeper timeout, *to = NULL;
- struct restart_block *restart;
- int ret;
-
- if (sec != MAX_SCHEDULE_TIMEOUT) {
- to = &timeout;
- hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
- hrtimer_init_sleeper(to, current);
- to->timer.expires = ktime_set(sec, nsec);
- }
-
- ret = do_futex_lock_pi(uaddr, detect, trylock, to);
-
- if (ret != -EINTR)
- return ret;
-
- pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
-
- restart = &current_thread_info()->restart_block;
- restart->fn = futex_lock_pi_restart;
- restart->arg0 = (unsigned long) uaddr;
- restart->arg1 = detect;
- if (to) {
- restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
- restart->arg3 = to->timer.expires.tv64 >> 32;
- } else
- restart->arg2 = restart->arg3 = 0;
-
- return -ERESTART_RESTARTBLOCK;
-}
-
-/*
* Userspace attempted a TID -> 0 atomic transition, and failed.
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
@@ -1413,9 +1389,11 @@ retry_locked:
* again. If it succeeds then we can return without waking
* anyone else up:
*/
- inc_preempt_count();
- uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
- dec_preempt_count();
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ inc_preempt_count();
+ uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+ dec_preempt_count();
+ }
if (unlikely(uval == -EFAULT))
goto pi_faulted;
@@ -1448,9 +1426,11 @@ retry_locked:
/*
* No waiters - kernel unlocks the futex:
*/
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
- goto pi_faulted;
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ }
out_unlock:
spin_unlock(&hb->lock);
@@ -1467,9 +1447,10 @@ pi_faulted:
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt))
+ if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+ ret = -EFAULT;
goto out_unlock;
-
+ }
goto retry_locked;
}
@@ -1669,9 +1650,9 @@ err_unlock:
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
- u32 uval, nval;
+ u32 uval, nval, mval;
retry:
if (get_user(uval, uaddr))
@@ -1688,21 +1669,45 @@ retry:
* thread-death.) The rest of the cleanup is done in
* userspace.
*/
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
- uval | FUTEX_OWNER_DIED);
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
if (nval == -EFAULT)
return -1;
if (nval != uval)
goto retry;
- if (uval & FUTEX_WAITERS)
- futex_wake(uaddr, 1);
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi) {
+ if (uval & FUTEX_WAITERS)
+ futex_wake(uaddr, 1);
+ }
}
return 0;
}
/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user **head, int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long *)head))
+ return -EFAULT;
+
+ *entry = (void *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
+
+/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
@@ -1712,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned long futex_offset;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (get_user(entry, &head->list.next))
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
return;
/*
* Fetch the relative futex offset:
@@ -1730,10 +1735,11 @@ void exit_robust_list(struct task_struct *curr)
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (get_user(pending, &head->list_op_pending))
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
return;
+
if (pending)
- handle_futex_death((void *)pending + futex_offset, curr);
+ handle_futex_death((void *)pending + futex_offset, curr, pip);
while (entry != &head->list) {
/*
@@ -1742,12 +1748,12 @@ void exit_robust_list(struct task_struct *curr)
*/
if (entry != pending)
if (handle_futex_death((void *)entry + futex_offset,
- curr))
+ curr, pi))
return;
/*
* Fetch the next entry in the list:
*/
- if (get_user(entry, &entry->next))
+ if (fetch_robust_entry(&entry, &entry->next, &pi))
return;
/*
* Avoid excessively long or circular lists:
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d1d92b441fb..c5cca3f65cb 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
#include <asm/uaccess.h>
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t *head, int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
compat_uptr_t uentry, upending;
- unsigned int limit = ROBUST_LIST_LIMIT;
compat_long_t futex_offset;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (get_user(uentry, &head->list.next))
+ if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
return;
- entry = compat_ptr(uentry);
/*
* Fetch the relative futex offset:
*/
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (get_user(upending, &head->list_op_pending))
+ if (fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
return;
- pending = compat_ptr(upending);
if (upending)
- handle_futex_death((void *)pending + futex_offset, curr);
+ handle_futex_death((void *)pending + futex_offset, curr, pip);
while (compat_ptr(uentry) != &head->list) {
/*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
*/
if (entry != pending)
if (handle_futex_death((void *)entry + futex_offset,
- curr))
+ curr, pi))
return;
/*
* Fetch the next entry in the list:
*/
- if (get_user(uentry, (compat_uptr_t *)&entry->next))
+ if (fetch_robust_entry(&uentry, &entry,
+ (compat_uptr_t *)&entry->next, &pi))
return;
- entry = compat_ptr(uentry);
/*
* Avoid excessively long or circular lists:
*/
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 8d3dc29ef41..21c38a7e666 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -187,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
{
struct hrtimer_base *new_base;
- new_base = &__get_cpu_var(hrtimer_bases[base->index]);
+ new_base = &__get_cpu_var(hrtimer_bases)[base->index];
if (base != new_base) {
/*
@@ -669,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task)
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
{
sl->timer.function = hrtimer_wakeup;
sl->task = task;
@@ -782,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu)
struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
int i;
- for (i = 0; i < MAX_HRTIMER_BASES; i++, base++)
+ for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
spin_lock_init(&base->lock);
+ lockdep_set_class(&base->lock, &base->lock_key);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -833,7 +835,7 @@ static void migrate_hrtimers(int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
+static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -857,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __devinitdata hrtimers_nb = {
+static struct notifier_block __cpuinitdata hrtimers_nb = {
.notifier_call = hrtimer_cpu_notify,
};
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 54105bdfe20..9336f2e89e4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -261,10 +261,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
* keep it masked and get out of here
*/
action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
+ desc->status |= IRQ_PENDING;
goto out;
+ }
desc->status |= IRQ_INPROGRESS;
+ desc->status &= ~IRQ_PENDING;
spin_unlock(&desc->lock);
action_ret = handle_IRQ_event(irq, regs, action);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index aeb6e391276..48a53f68af9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -20,6 +20,11 @@
/**
* handle_bad_irq - handle spurious and unhandled irqs
+ * @irq: the interrupt number
+ * @desc: description of the interrupt
+ * @regs: pointer to a register structure
+ *
+ * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
void fastcall
handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
@@ -132,7 +137,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
handle_dynamic_tick(action);
if (!(action->flags & IRQF_DISABLED))
- local_irq_enable();
+ local_irq_enable_in_hardirq();
do {
ret = action->handler(irq, action->dev_id, regs);
@@ -249,3 +254,19 @@ out:
return 1;
}
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+static struct lock_class_key irq_desc_lock_class;
+
+void early_init_irq_lock_class(void)
+{
+ int i;
+
+ for (i = 0; i < NR_IRQS; i++)
+ lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
+}
+
+#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c911c6ec4dd..92be519eff2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq);
* @irq: interrupt to control
* @on: enable/disable power management wakeup
*
- * Enable/disable power management wakeup mode
+ * Enable/disable power management wakeup mode, which is
+ * disabled by default. Enables and disables must match,
+ * just as they match for non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep
+ * states like "suspend to RAM".
*/
int set_irq_wake(unsigned int irq, unsigned int on)
{
struct irq_desc *desc = irq_desc + irq;
unsigned long flags;
int ret = -ENXIO;
+ int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
+ /* wakeup-capable irqs can be shared between drivers that
+ * don't need to have the same sleep mode behaviors.
+ */
spin_lock_irqsave(&desc->lock, flags);
- if (desc->chip->set_wake)
+ if (on) {
+ if (desc->wake_depth++ == 0)
+ desc->status |= IRQ_WAKEUP;
+ else
+ set_wake = NULL;
+ } else {
+ if (desc->wake_depth == 0) {
+ printk(KERN_WARNING "Unbalanced IRQ %d "
+ "wake disable\n", irq);
+ WARN_ON(1);
+ } else if (--desc->wake_depth == 0)
+ desc->status &= ~IRQ_WAKEUP;
+ else
+ set_wake = NULL;
+ }
+ if (set_wake)
ret = desc->chip->set_wake(irq, on);
spin_unlock_irqrestore(&desc->lock, flags);
return ret;
@@ -410,6 +434,12 @@ int request_irq(unsigned int irq,
struct irqaction *action;
int retval;
+#ifdef CONFIG_LOCKDEP
+ /*
+ * Lockdep wants atomic interrupt handlers:
+ */
+ irqflags |= SA_INTERRUPT;
+#endif
/*
* Sanity-check: shared interrupts must pass in a real dev-ID,
* otherwise we'll have trouble later trying to figure out
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 872f91ba2ce..35f10f7ff94 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -63,8 +63,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
desc->chip->enable(irq);
if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
- desc->status &= ~IRQ_PENDING;
- desc->status = status | IRQ_REPLAY;
+ desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
if (!desc->chip || !desc->chip->retrigger ||
!desc->chip->retrigger(irq)) {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf9..ab16a5a4cfe 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value,
- &iter->type, iter->name);
+ &iter->value, &iter->type,
+ iter->name, sizeof(iter->name));
if (iter->owner == NULL)
return 0;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1b7157af051..5c470c57fb5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -197,11 +197,12 @@ static void __call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
pid_t pid;
+ int wait = sub_info->wait;
/* CLONE_VFORK: wait until the usermode helper has execve'd
* successfully We need the data structures to stay around
* until that is done. */
- if (sub_info->wait)
+ if (wait)
pid = kernel_thread(wait_for_helper, sub_info,
CLONE_FS | CLONE_FILES | SIGCHLD);
else
@@ -211,7 +212,7 @@ static void __call_usermodehelper(void *data)
if (pid < 0) {
sub_info->retval = pid;
complete(sub_info->complete);
- } else if (!sub_info->wait)
+ } else if (!wait)
complete(sub_info->complete);
}
@@ -233,7 +234,7 @@ static void __call_usermodehelper(void *data)
int call_usermodehelper_keys(char *path, char **argv, char **envp,
struct key *session_keyring, int wait)
{
- DECLARE_COMPLETION(done);
+ DECLARE_COMPLETION_ONSTACK(done);
struct subprocess_info sub_info = {
.complete = &done,
.path = path,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 64aab081153..3f57dfdc8f9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
copy_kprobe(p, ap);
+ flush_insn_slot(ap);
ap->addr = p->addr;
ap->pre_handler = aggr_pre_handler;
ap->fault_handler = aggr_fault_handler;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 24be714b04c..4f9c60ef95e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -216,23 +216,6 @@ EXPORT_SYMBOL(kthread_bind);
*/
int kthread_stop(struct task_struct *k)
{
- return kthread_stop_sem(k, NULL);
-}
-EXPORT_SYMBOL(kthread_stop);
-
-/**
- * kthread_stop_sem - stop a thread created by kthread_create().
- * @k: thread created by kthread_create().
- * @s: semaphore that @k waits on while idle.
- *
- * Does essentially the same thing as kthread_stop() above, but wakes
- * @k by calling up(@s).
- *
- * Returns the result of threadfn(), or %-EINTR if wake_up_process()
- * was never called.
- */
-int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
-{
int ret;
mutex_lock(&kthread_stop_lock);
@@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
/* Now set kthread_should_stop() to true, and wake it up. */
kthread_stop_info.k = k;
- if (s)
- up(s);
- else
- wake_up_process(k);
+ wake_up_process(k);
put_task_struct(k);
/* Once it dies, reset stop ptr, gather result and we're done. */
@@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
return ret;
}
-EXPORT_SYMBOL(kthread_stop_sem);
+EXPORT_SYMBOL(kthread_stop);
static __init int helper_init(void)
{
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
new file mode 100644
index 00000000000..9bad1788451
--- /dev/null
+++ b/kernel/lockdep.c
@@ -0,0 +1,2704 @@
+/*
+ * kernel/lockdep.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * this code maps all the lock dependencies as they occur in a live kernel
+ * and will warn about the following classes of locking bugs:
+ *
+ * - lock inversion scenarios
+ * - circular lock dependencies
+ * - hardirq/softirq safe/unsafe locking bugs
+ *
+ * Bugs are reported even if the current locking scenario does not cause
+ * any deadlock at this point.
+ *
+ * I.e. if anytime in the past two locks were taken in a different order,
+ * even if it happened for another task, even if those were different
+ * locks (but of the same class as this lock), this code will detect it.
+ *
+ * Thanks to Arjan van de Ven for coming up with the initial idea of
+ * mapping lock dependencies runtime.
+ */
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/interrupt.h>
+#include <linux/stacktrace.h>
+#include <linux/debug_locks.h>
+#include <linux/irqflags.h>
+
+#include <asm/sections.h>
+
+#include "lockdep_internals.h"
+
+/*
+ * hash_lock: protects the lockdep hashes and class/list/hash allocators.
+ *
+ * This is one of the rare exceptions where it's justified
+ * to use a raw spinlock - we really dont want the spinlock
+ * code to recurse back into the lockdep code.
+ */
+static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+static int lockdep_initialized;
+
+unsigned long nr_list_entries;
+static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
+
+/*
+ * Allocate a lockdep entry. (assumes hash_lock held, returns
+ * with NULL on failure)
+ */
+static struct lock_list *alloc_list_entry(void)
+{
+ if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return NULL;
+ }
+ return list_entries + nr_list_entries++;
+}
+
+/*
+ * All data structures here are protected by the global debug_lock.
+ *
+ * Mutex key structs only get allocated, once during bootup, and never
+ * get freed - this significantly simplifies the debugging code.
+ */
+unsigned long nr_lock_classes;
+static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+
+/*
+ * We keep a global list of all lock classes. The list only grows,
+ * never shrinks. The list is only accessed with the lockdep
+ * spinlock lock held.
+ */
+LIST_HEAD(all_lock_classes);
+
+/*
+ * The lockdep classes are in a hash-table as well, for fast lookup:
+ */
+#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
+#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
+#define CLASSHASH_MASK (CLASSHASH_SIZE - 1)
+#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
+#define classhashentry(key) (classhash_table + __classhashfn((key)))
+
+static struct list_head classhash_table[CLASSHASH_SIZE];
+
+unsigned long nr_lock_chains;
+static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+
+/*
+ * We put the lock dependency chains into a hash-table as well, to cache
+ * their existence:
+ */
+#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
+#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
+#define CHAINHASH_MASK (CHAINHASH_SIZE - 1)
+#define __chainhashfn(chain) \
+ (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
+#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
+
+static struct list_head chainhash_table[CHAINHASH_SIZE];
+
+/*
+ * The hash key of the lock dependency chains is a hash itself too:
+ * it's a hash of all locks taken up to that lock, including that lock.
+ * It's a 64-bit hash, because it's important for the keys to be
+ * unique.
+ */
+#define iterate_chain_key(key1, key2) \
+ (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
+ ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
+ (key2))
+
+void lockdep_off(void)
+{
+ current->lockdep_recursion++;
+}
+
+EXPORT_SYMBOL(lockdep_off);
+
+void lockdep_on(void)
+{
+ current->lockdep_recursion--;
+}
+
+EXPORT_SYMBOL(lockdep_on);
+
+int lockdep_internal(void)
+{
+ return current->lockdep_recursion != 0;
+}
+
+EXPORT_SYMBOL(lockdep_internal);
+
+/*
+ * Debugging switches:
+ */
+
+#define VERBOSE 0
+#ifdef VERBOSE
+# define VERY_VERBOSE 0
+#endif
+
+#if VERBOSE
+# define HARDIRQ_VERBOSE 1
+# define SOFTIRQ_VERBOSE 1
+#else
+# define HARDIRQ_VERBOSE 0
+# define SOFTIRQ_VERBOSE 0
+#endif
+
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
+/*
+ * Quick filtering for interesting events:
+ */
+static int class_filter(struct lock_class *class)
+{
+#if 0
+ /* Example */
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "lockname"))
+ return 1;
+ if (class->name_version == 1 &&
+ !strcmp(class->name, "&struct->lockfield"))
+ return 1;
+#endif
+ /* Allow everything else. 0 would be filter everything else */
+ return 1;
+}
+#endif
+
+static int verbose(struct lock_class *class)
+{
+#if VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+static int hardirq_verbose(struct lock_class *class)
+{
+#if HARDIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+static int softirq_verbose(struct lock_class *class)
+{
+#if SOFTIRQ_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+
+#endif
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the hash_lock.
+ */
+unsigned long nr_stack_trace_entries;
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
+static int save_trace(struct stack_trace *trace)
+{
+ trace->nr_entries = 0;
+ trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
+ trace->entries = stack_trace + nr_stack_trace_entries;
+
+ save_stack_trace(trace, NULL, 0, 3);
+
+ trace->max_entries = trace->nr_entries;
+
+ nr_stack_trace_entries += trace->nr_entries;
+ if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
+ return 0;
+
+ if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
+ __raw_spin_unlock(&hash_lock);
+ if (debug_locks_off()) {
+ printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+ }
+ return 0;
+ }
+
+ return 1;
+}
+
+unsigned int nr_hardirq_chains;
+unsigned int nr_softirq_chains;
+unsigned int nr_process_chains;
+unsigned int max_lockdep_depth;
+unsigned int max_recursion_depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * We cannot printk in early bootup code. Not even early_printk()
+ * might work. So we mark any initialization errors and printk
+ * about it later on, in lockdep_info().
+ */
+static int lockdep_init_error;
+
+/*
+ * Various lockdep statistics:
+ */
+atomic_t chain_lookup_hits;
+atomic_t chain_lookup_misses;
+atomic_t hardirqs_on_events;
+atomic_t hardirqs_off_events;
+atomic_t redundant_hardirqs_on;
+atomic_t redundant_hardirqs_off;
+atomic_t softirqs_on_events;
+atomic_t softirqs_off_events;
+atomic_t redundant_softirqs_on;
+atomic_t redundant_softirqs_off;
+atomic_t nr_unused_locks;
+atomic_t nr_cyclic_checks;
+atomic_t nr_cyclic_check_recursions;
+atomic_t nr_find_usage_forwards_checks;
+atomic_t nr_find_usage_forwards_recursions;
+atomic_t nr_find_usage_backwards_checks;
+atomic_t nr_find_usage_backwards_recursions;
+# define debug_atomic_inc(ptr) atomic_inc(ptr)
+# define debug_atomic_dec(ptr) atomic_dec(ptr)
+# define debug_atomic_read(ptr) atomic_read(ptr)
+#else
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
+
+/*
+ * Locking printouts:
+ */
+
+static const char *usage_str[] =
+{
+ [LOCK_USED] = "initial-use ",
+ [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W",
+ [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W",
+ [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W",
+ [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
+ [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
+ [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
+ [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
+ [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
+};
+
+const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+{
+ unsigned long offs, size;
+ char *modname;
+
+ return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
+}
+
+void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
+{
+ *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
+
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ *c1 = '+';
+ else
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+ *c1 = '-';
+
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ *c2 = '+';
+ else
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+ *c2 = '-';
+
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ *c3 = '-';
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
+ *c3 = '+';
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ *c3 = '?';
+ }
+
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ *c4 = '-';
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
+ *c4 = '+';
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ *c4 = '?';
+ }
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+ char str[128], c1, c2, c3, c4;
+ const char *name;
+
+ get_usage_chars(class, &c1, &c2, &c3, &c4);
+
+ name = class->name;
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ printk(" (%s", name);
+ } else {
+ printk(" (%s", name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ if (class->subclass)
+ printk("/%d", class->subclass);
+ }
+ printk("){%c%c%c%c}", c1, c2, c3, c4);
+}
+
+static void print_lockdep_cache(struct lockdep_map *lock)
+{
+ const char *name;
+ char str[128];
+
+ name = lock->name;
+ if (!name)
+ name = __get_key_name(lock->key->subkeys, str);
+
+ printk("%s", name);
+}
+
+static void print_lock(struct held_lock *hlock)
+{
+ print_lock_name(hlock->class);
+ printk(", at: ");
+ print_ip_sym(hlock->acquire_ip);
+}
+
+static void lockdep_print_held_locks(struct task_struct *curr)
+{
+ int i, depth = curr->lockdep_depth;
+
+ if (!depth) {
+ printk("no locks held by %s/%d.\n", curr->comm, curr->pid);
+ return;
+ }
+ printk("%d lock%s held by %s/%d:\n",
+ depth, depth > 1 ? "s" : "", curr->comm, curr->pid);
+
+ for (i = 0; i < depth; i++) {
+ printk(" #%d: ", i);
+ print_lock(curr->held_locks + i);
+ }
+}
+
+static void print_lock_class_header(struct lock_class *class, int depth)
+{
+ int bit;
+
+ printk("%*s->", depth, "");
+ print_lock_name(class);
+ printk(" ops: %lu", class->ops);
+ printk(" {\n");
+
+ for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ if (class->usage_mask & (1 << bit)) {
+ int len = depth;
+
+ len += printk("%*s %s", depth, "", usage_str[bit]);
+ len += printk(" at:\n");
+ print_stack_trace(class->usage_traces + bit, len);
+ }
+ }
+ printk("%*s }\n", depth, "");
+
+ printk("%*s ... key at: ",depth,"");
+ print_ip_sym((unsigned long)class->key);
+}
+
+/*
+ * printk all lock dependencies starting at <entry>:
+ */
+static void print_lock_dependencies(struct lock_class *class, int depth)
+{
+ struct lock_list *entry;
+
+ if (DEBUG_LOCKS_WARN_ON(depth >= 20))
+ return;
+
+ print_lock_class_header(class, depth);
+
+ list_for_each_entry(entry, &class->locks_after, entry) {
+ DEBUG_LOCKS_WARN_ON(!entry->class);
+ print_lock_dependencies(entry->class, depth + 1);
+
+ printk("%*s ... acquired at:\n",depth,"");
+ print_stack_trace(&entry->trace, 2);
+ printk("\n");
+ }
+}
+
+/*
+ * Add a new dependency to the head of the list:
+ */
+static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
+ struct list_head *head, unsigned long ip)
+{
+ struct lock_list *entry;
+ /*
+ * Lock not present yet - get a new dependency struct and
+ * add it to the list:
+ */
+ entry = alloc_list_entry();
+ if (!entry)
+ return 0;
+
+ entry->class = this;
+ save_trace(&entry->trace);
+
+ /*
+ * Since we never remove from the dependency list, the list can
+ * be walked lockless by other CPUs, it's only allocation
+ * that must be protected by the spinlock. But this also means
+ * we must make new entries visible only once writes to the
+ * entry become visible - hence the RCU op:
+ */
+ list_add_tail_rcu(&entry->entry, head);
+
+ return 1;
+}
+
+/*
+ * Recursive, forwards-direction lock-dependency checking, used for
+ * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
+ * checking.
+ *
+ * (to keep the stackframe of the recursive functions small we
+ * use these global variables, and we also mark various helper
+ * functions as noinline.)
+ */
+static struct held_lock *check_source, *check_target;
+
+/*
+ * Print a dependency chain entry (this is only done when a deadlock
+ * has been detected):
+ */
+static noinline int
+print_circular_bug_entry(struct lock_list *target, unsigned int depth)
+{
+ if (debug_locks_silent)
+ return 0;
+ printk("\n-> #%u", depth);
+ print_lock_name(target->class);
+ printk(":\n");
+ print_stack_trace(&target->trace, 6);
+
+ return 0;
+}
+
+/*
+ * When a circular dependency is detected, print the
+ * header first:
+ */
+static noinline int
+print_circular_bug_header(struct lock_list *entry, unsigned int depth)
+{
+ struct task_struct *curr = current;
+
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=======================================================\n");
+ printk( "[ INFO: possible circular locking dependency detected ]\n");
+ printk( "-------------------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, curr->pid);
+ print_lock(check_source);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(check_target);
+ printk("\nwhich lock already depends on the new lock.\n\n");
+ printk("\nthe existing dependency chain (in reverse order) is:\n");
+
+ print_circular_bug_entry(entry, depth);
+
+ return 0;
+}
+
+static noinline int print_circular_bug_tail(void)
+{
+ struct task_struct *curr = current;
+ struct lock_list this;
+
+ if (debug_locks_silent)
+ return 0;
+
+ this.class = check_source->class;
+ save_trace(&this.trace);
+ print_circular_bug_entry(&this, 0);
+
+ printk("\nother info that might help us debug this:\n\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int noinline print_infinite_recursion_bug(void)
+{
+ __raw_spin_unlock(&hash_lock);
+ DEBUG_LOCKS_WARN_ON(1);
+
+ return 0;
+}
+
+/*
+ * Prove that the dependency graph starting at <entry> can not
+ * lead to <target>. Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+
+ debug_atomic_inc(&nr_cyclic_check_recursions);
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= 20)
+ return print_infinite_recursion_bug();
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_after, entry) {
+ if (entry->class == check_target->class)
+ return print_circular_bug_header(entry, depth+1);
+ debug_atomic_inc(&nr_cyclic_checks);
+ if (!check_noncircular(entry->class, depth+1))
+ return print_circular_bug_entry(entry, depth+1);
+ }
+ return 1;
+}
+
+static int very_verbose(struct lock_class *class)
+{
+#if VERY_VERBOSE
+ return class_filter(class);
+#endif
+ return 0;
+}
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/*
+ * Forwards and backwards subgraph searching, for the purposes of
+ * proving that two subgraphs can be connected by a new dependency
+ * without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ */
+static enum lock_usage_bit find_usage_bit;
+static struct lock_class *forwards_match, *backwards_match;
+
+/*
+ * Find a node in the forwards-direction dependency sub-graph starting
+ * at <source> that matches <find_usage_bit>.
+ *
+ * Return 2 if such a node exists in the subgraph, and put that node
+ * into <forwards_match>.
+ *
+ * Return 1 otherwise and keep <forwards_match> unchanged.
+ * Return 0 on error.
+ */
+static noinline int
+find_usage_forwards(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+ int ret;
+
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= 20)
+ return print_infinite_recursion_bug();
+
+ debug_atomic_inc(&nr_find_usage_forwards_checks);
+ if (source->usage_mask & (1 << find_usage_bit)) {
+ forwards_match = source;
+ return 2;
+ }
+
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_after, entry) {
+ debug_atomic_inc(&nr_find_usage_forwards_recursions);
+ ret = find_usage_forwards(entry->class, depth+1);
+ if (ret == 2 || ret == 0)
+ return ret;
+ }
+ return 1;
+}
+
+/*
+ * Find a node in the backwards-direction dependency sub-graph starting
+ * at <source> that matches <find_usage_bit>.
+ *
+ * Return 2 if such a node exists in the subgraph, and put that node
+ * into <backwards_match>.
+ *
+ * Return 1 otherwise and keep <backwards_match> unchanged.
+ * Return 0 on error.
+ */
+static noinline int
+find_usage_backwards(struct lock_class *source, unsigned int depth)
+{
+ struct lock_list *entry;
+ int ret;
+
+ if (depth > max_recursion_depth)
+ max_recursion_depth = depth;
+ if (depth >= 20)
+ return print_infinite_recursion_bug();
+
+ debug_atomic_inc(&nr_find_usage_backwards_checks);
+ if (source->usage_mask & (1 << find_usage_bit)) {
+ backwards_match = source;
+ return 2;
+ }
+
+ /*
+ * Check this lock's dependency list:
+ */
+ list_for_each_entry(entry, &source->locks_before, entry) {
+ debug_atomic_inc(&nr_find_usage_backwards_recursions);
+ ret = find_usage_backwards(entry->class, depth+1);
+ if (ret == 2 || ret == 0)
+ return ret;
+ }
+ return 1;
+}
+
+static int
+print_bad_irq_dependency(struct task_struct *curr,
+ struct held_lock *prev,
+ struct held_lock *next,
+ enum lock_usage_bit bit1,
+ enum lock_usage_bit bit2,
+ const char *irqclass)
+{
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n======================================================\n");
+ printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ irqclass, irqclass);
+ printk( "------------------------------------------------------\n");
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
+ curr->comm, curr->pid,
+ curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+ curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
+ curr->hardirqs_enabled,
+ curr->softirqs_enabled);
+ print_lock(next);
+
+ printk("\nand this task is already holding:\n");
+ print_lock(prev);
+ printk("which would create a new lock dependency:\n");
+ print_lock_name(prev->class);
+ printk(" ->");
+ print_lock_name(next->class);
+ printk("\n");
+
+ printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
+ irqclass);
+ print_lock_name(backwards_match);
+ printk("\n... which became %s-irq-safe at:\n", irqclass);
+
+ print_stack_trace(backwards_match->usage_traces + bit1, 1);
+
+ printk("\nto a %s-irq-unsafe lock:\n", irqclass);
+ print_lock_name(forwards_match);
+ printk("\n... which became %s-irq-unsafe at:\n", irqclass);
+ printk("...");
+
+ print_stack_trace(forwards_match->usage_traces + bit2, 1);
+
+ printk("\nother info that might help us debug this:\n\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
+ print_lock_dependencies(backwards_match, 0);
+
+ printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
+ print_lock_dependencies(forwards_match, 0);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int
+check_usage(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next, enum lock_usage_bit bit_backwards,
+ enum lock_usage_bit bit_forwards, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit_backwards;
+ /* fills in <backwards_match> */
+ ret = find_usage_backwards(prev->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ find_usage_bit = bit_forwards;
+ ret = find_usage_forwards(next->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+ /* ret == 2 */
+ return print_bad_irq_dependency(curr, prev, next,
+ bit_backwards, bit_forwards, irqclass);
+}
+
+#endif
+
+static int
+print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ debug_locks_off();
+ __raw_spin_unlock(&hash_lock);
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=============================================\n");
+ printk( "[ INFO: possible recursive locking detected ]\n");
+ printk( "---------------------------------------------\n");
+ printk("%s/%d is trying to acquire lock:\n",
+ curr->comm, curr->pid);
+ print_lock(next);
+ printk("\nbut task is already holding lock:\n");
+ print_lock(prev);
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Check whether we are holding such a class already.
+ *
+ * (Note that this has to be done separately, because the graph cannot
+ * detect such classes of deadlocks.)
+ *
+ * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ */
+static int
+check_deadlock(struct task_struct *curr, struct held_lock *next,
+ struct lockdep_map *next_instance, int read)
+{
+ struct held_lock *prev;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ prev = curr->held_locks + i;
+ if (prev->class != next->class)
+ continue;
+ /*
+ * Allow read-after-read recursion of the same
+ * lock class (i.e. read_lock(lock)+read_lock(lock)):
+ */
+ if ((read == 2) && prev->read)
+ return 2;
+ return print_deadlock_bug(curr, prev, next);
+ }
+ return 1;
+}
+
+/*
+ * There was a chain-cache miss, and we are about to add a new dependency
+ * to a previous lock. We recursively validate the following rules:
+ *
+ * - would the adding of the <prev> -> <next> dependency create a
+ * circular dependency in the graph? [== circular deadlock]
+ *
+ * - does the new prev->next dependency connect any hardirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * hardirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with hardirq contexts]
+ *
+ * - does the new prev->next dependency connect any softirq-safe lock
+ * (in the full backwards-subgraph starting at <prev>) with any
+ * softirq-unsafe lock (in the full forwards-subgraph starting at
+ * <next>)? [== illegal lock inversion with softirq contexts]
+ *
+ * any of these scenarios could lead to a deadlock.
+ *
+ * Then if all the validations pass, we add the forwards and backwards
+ * dependency.
+ */
+static int
+check_prev_add(struct task_struct *curr, struct held_lock *prev,
+ struct held_lock *next)
+{
+ struct lock_list *entry;
+ int ret;
+
+ /*
+ * Prove that the new <prev> -> <next> dependency would not
+ * create a circular dependency in the graph. (We do this by
+ * forward-recursing into the graph starting at <next>, and
+ * checking whether we can reach <prev>.)
+ *
+ * We are using global variables to control the recursion, to
+ * keep the stackframe size of the recursive functions low:
+ */
+ check_source = next;
+ check_target = prev;
+ if (!(check_noncircular(next->class, 0)))
+ return print_circular_bug_tail();
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+
+ /*
+ * Prove that the new dependency does not connect a hardirq-safe-read
+ * lock with a hardirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
+ LOCK_ENABLED_HARDIRQS, "hard-read"))
+ return 0;
+
+ /*
+ * Prove that the new dependency does not connect a softirq-safe
+ * lock with a softirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+ /*
+ * Prove that the new dependency does not connect a softirq-safe-read
+ * lock with a softirq-unsafe lock - to achieve this we search
+ * the backwards-subgraph starting at <prev>, and the
+ * forwards-subgraph starting at <next>:
+ */
+ if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+#endif
+ /*
+ * For recursive read-locks we do all the dependency checks,
+ * but we dont store read-triggered dependencies (only
+ * write-triggered dependencies). This ensures that only the
+ * write-side dependencies matter, and that if for example a
+ * write-lock never takes any other locks, then the reads are
+ * equivalent to a NOP.
+ */
+ if (next->read == 2 || prev->read == 2)
+ return 1;
+ /*
+ * Is the <prev> -> <next> dependency already present?
+ *
+ * (this may occur even though this is a new chain: consider
+ * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
+ * chains - the second one will be new, but L1 already has
+ * L2 added to its dependency list, due to the first chain.)
+ */
+ list_for_each_entry(entry, &prev->class->locks_after, entry) {
+ if (entry->class == next->class)
+ return 2;
+ }
+
+ /*
+ * Ok, all validations passed, add the new lock
+ * to the previous lock's dependency list:
+ */
+ ret = add_lock_to_list(prev->class, next->class,
+ &prev->class->locks_after, next->acquire_ip);
+ if (!ret)
+ return 0;
+ /*
+ * Return value of 2 signals 'dependency already added',
+ * in that case we dont have to add the backlink either.
+ */
+ if (ret == 2)
+ return 2;
+ ret = add_lock_to_list(next->class, prev->class,
+ &next->class->locks_before, next->acquire_ip);
+
+ /*
+ * Debugging printouts:
+ */
+ if (verbose(prev->class) || verbose(next->class)) {
+ __raw_spin_unlock(&hash_lock);
+ printk("\n new dependency: ");
+ print_lock_name(prev->class);
+ printk(" => ");
+ print_lock_name(next->class);
+ printk("\n");
+ dump_stack();
+ __raw_spin_lock(&hash_lock);
+ }
+ return 1;
+}
+
+/*
+ * Add the dependency to all directly-previous locks that are 'relevant'.
+ * The ones that are relevant are (in increasing distance from curr):
+ * all consecutive trylock entries and the final non-trylock entry - or
+ * the end of this context's lock-chain - whichever comes first.
+ */
+static int
+check_prevs_add(struct task_struct *curr, struct held_lock *next)
+{
+ int depth = curr->lockdep_depth;
+ struct held_lock *hlock;
+
+ /*
+ * Debugging checks.
+ *
+ * Depth must not be zero for a non-head lock:
+ */
+ if (!depth)
+ goto out_bug;
+ /*
+ * At least two relevant locks must exist for this
+ * to be a head:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ goto out_bug;
+
+ for (;;) {
+ hlock = curr->held_locks + depth-1;
+ /*
+ * Only non-recursive-read entries get new dependencies
+ * added:
+ */
+ if (hlock->read != 2) {
+ check_prev_add(curr, hlock, next);
+ /*
+ * Stop after the first non-trylock entry,
+ * as non-trylock entries have added their
+ * own direct dependencies already, so this
+ * lock is connected to them indirectly:
+ */
+ if (!hlock->trylock)
+ break;
+ }
+ depth--;
+ /*
+ * End of lock-stack?
+ */
+ if (!depth)
+ break;
+ /*
+ * Stop the search if we cross into another context:
+ */
+ if (curr->held_locks[depth].irq_context !=
+ curr->held_locks[depth-1].irq_context)
+ break;
+ }
+ return 1;
+out_bug:
+ __raw_spin_unlock(&hash_lock);
+ DEBUG_LOCKS_WARN_ON(1);
+
+ return 0;
+}
+
+
+/*
+ * Is this the address of a static object:
+ */
+static int static_obj(void *obj)
+{
+ unsigned long start = (unsigned long) &_stext,
+ end = (unsigned long) &_end,
+ addr = (unsigned long) obj;
+#ifdef CONFIG_SMP
+ int i;
+#endif
+
+ /*
+ * static variable?
+ */
+ if ((addr >= start) && (addr < end))
+ return 1;
+
+#ifdef CONFIG_SMP
+ /*
+ * percpu var?
+ */
+ for_each_possible_cpu(i) {
+ start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
+ end = (unsigned long) &__per_cpu_end + per_cpu_offset(i);
+
+ if ((addr >= start) && (addr < end))
+ return 1;
+ }
+#endif
+
+ /*
+ * module var?
+ */
+ return is_module_address(addr);
+}
+
+/*
+ * To make lock name printouts unique, we calculate a unique
+ * class->name_version generation counter:
+ */
+static int count_matching_names(struct lock_class *new_class)
+{
+ struct lock_class *class;
+ int count = 0;
+
+ if (!new_class->name)
+ return 0;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ if (new_class->key - new_class->subclass == class->key)
+ return class->name_version;
+ if (class->name && !strcmp(class->name, new_class->name))
+ count = max(count, class->name_version);
+ }
+
+ return count + 1;
+}
+
+extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * If the architecture calls into lockdep before initializing
+ * the hashes then we'll warn about it later. (we cannot printk
+ * right now)
+ */
+ if (unlikely(!lockdep_initialized)) {
+ lockdep_init();
+ lockdep_init_error = 1;
+ }
+#endif
+
+ /*
+ * Static locks do not have their class-keys yet - for them the key
+ * is the lock object itself:
+ */
+ if (unlikely(!lock->key))
+ lock->key = (void *)lock;
+
+ /*
+ * NOTE: the class-key must be unique. For dynamic locks, a static
+ * lock_class_key variable is passed in through the mutex_init()
+ * (or spin_lock_init()) call - which acts as the key. For static
+ * locks we use the lock object itself as the key.
+ */
+ if (sizeof(struct lock_class_key) > sizeof(struct lock_class))
+ __error_too_big_MAX_LOCKDEP_SUBCLASSES();
+
+ key = lock->key->subkeys + subclass;
+
+ hash_head = classhashentry(key);
+
+ /*
+ * We can walk the hash lockfree, because the hash only
+ * grows, and we are careful when adding entries to the end:
+ */
+ list_for_each_entry(class, hash_head, hash_entry)
+ if (class->key == key)
+ return class;
+
+ return NULL;
+}
+
+/*
+ * Register a lock's class in the hash-table, if the class is not present
+ * yet. Otherwise we look it up. We cache the result in the lock object
+ * itself, so actual lookup of the hash should be once per lock object.
+ */
+static inline struct lock_class *
+register_lock_class(struct lockdep_map *lock, unsigned int subclass)
+{
+ struct lockdep_subclass_key *key;
+ struct list_head *hash_head;
+ struct lock_class *class;
+
+ class = look_up_lock_class(lock, subclass);
+ if (likely(class))
+ return class;
+
+ /*
+ * Debug-check: all keys must be persistent!
+ */
+ if (!static_obj(lock->key)) {
+ debug_locks_off();
+ printk("INFO: trying to register non-static key.\n");
+ printk("the code is fine but needs lockdep annotation.\n");
+ printk("turning off the locking correctness validator.\n");
+ dump_stack();
+
+ return NULL;
+ }
+
+ key = lock->key->subkeys + subclass;
+ hash_head = classhashentry(key);
+
+ __raw_spin_lock(&hash_lock);
+ /*
+ * We have to do the hash-walk again, to avoid races
+ * with another CPU:
+ */
+ list_for_each_entry(class, hash_head, hash_entry)
+ if (class->key == key)
+ goto out_unlock_set;
+ /*
+ * Allocate a new key from the static array, and add it to
+ * the hash:
+ */
+ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return NULL;
+ }
+ class = lock_classes + nr_lock_classes++;
+ debug_atomic_inc(&nr_unused_locks);
+ class->key = key;
+ class->name = lock->name;
+ class->subclass = subclass;
+ INIT_LIST_HEAD(&class->lock_entry);
+ INIT_LIST_HEAD(&class->locks_before);
+ INIT_LIST_HEAD(&class->locks_after);
+ class->name_version = count_matching_names(class);
+ /*
+ * We use RCU's safe list-add method to make
+ * parallel walking of the hash-list safe:
+ */
+ list_add_tail_rcu(&class->hash_entry, hash_head);
+
+ if (verbose(class)) {
+ __raw_spin_unlock(&hash_lock);
+ printk("\nnew class %p: %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+ __raw_spin_lock(&hash_lock);
+ }
+out_unlock_set:
+ __raw_spin_unlock(&hash_lock);
+
+ if (!subclass)
+ lock->class_cache = class;
+
+ DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
+
+ return class;
+}
+
+/*
+ * Look up a dependency chain. If the key is not present yet then
+ * add it and return 0 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 1.
+ */
+static inline int lookup_chain_cache(u64 chain_key)
+{
+ struct list_head *hash_head = chainhashentry(chain_key);
+ struct lock_chain *chain;
+
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+ /*
+ * We can walk it lock-free, because entries only get added
+ * to the hash:
+ */
+ list_for_each_entry(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+cache_hit:
+ debug_atomic_inc(&chain_lookup_hits);
+ /*
+ * In the debugging case, force redundant checking
+ * by returning 1:
+ */
+#ifdef CONFIG_DEBUG_LOCKDEP
+ __raw_spin_lock(&hash_lock);
+ return 1;
+#endif
+ return 0;
+ }
+ }
+ /*
+ * Allocate a new chain entry from the static array, and add
+ * it to the hash:
+ */
+ __raw_spin_lock(&hash_lock);
+ /*
+ * We have to walk the chain again locked - to avoid duplicates:
+ */
+ list_for_each_entry(chain, hash_head, entry) {
+ if (chain->chain_key == chain_key) {
+ __raw_spin_unlock(&hash_lock);
+ goto cache_hit;
+ }
+ }
+ if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+ chain = lock_chains + nr_lock_chains++;
+ chain->chain_key = chain_key;
+ list_add_tail_rcu(&chain->entry, hash_head);
+ debug_atomic_inc(&chain_lookup_misses);
+#ifdef CONFIG_TRACE_IRQFLAGS
+ if (current->hardirq_context)
+ nr_hardirq_chains++;
+ else {
+ if (current->softirq_context)
+ nr_softirq_chains++;
+ else
+ nr_process_chains++;
+ }
+#else
+ nr_process_chains++;
+#endif
+
+ return 1;
+}
+
+/*
+ * We are building curr_chain_key incrementally, so double-check
+ * it from scratch, to make sure that it's done correctly:
+ */
+static void check_chain_key(struct task_struct *curr)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ struct held_lock *hlock, *prev_hlock = NULL;
+ unsigned int i, id;
+ u64 chain_key = 0;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+ if (chain_key != hlock->prev_chain_key) {
+ debug_locks_off();
+ printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)hlock->prev_chain_key);
+ WARN_ON(1);
+ return;
+ }
+ id = hlock->class - lock_classes;
+ DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS);
+ if (prev_hlock && (prev_hlock->irq_context !=
+ hlock->irq_context))
+ chain_key = 0;
+ chain_key = iterate_chain_key(chain_key, id);
+ prev_hlock = hlock;
+ }
+ if (chain_key != curr->curr_chain_key) {
+ debug_locks_off();
+ printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+ curr->lockdep_depth, i,
+ (unsigned long long)chain_key,
+ (unsigned long long)curr->curr_chain_key);
+ WARN_ON(1);
+ }
+#endif
+}
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+
+/*
+ * print irq inversion bug:
+ */
+static int
+print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
+ struct held_lock *this, int forwards,
+ const char *irqclass)
+{
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=========================================================\n");
+ printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
+ printk( "---------------------------------------------------------\n");
+ printk("%s/%d just changed the state of lock:\n",
+ curr->comm, curr->pid);
+ print_lock(this);
+ if (forwards)
+ printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
+ else
+ printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
+ print_lock_name(other);
+ printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nthe first lock's dependencies:\n");
+ print_lock_dependencies(this->class, 0);
+
+ printk("\nthe second lock's dependencies:\n");
+ print_lock_dependencies(other, 0);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Prove that in the forwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_forwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit;
+ /* fills in <forwards_match> */
+ ret = find_usage_forwards(this->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
+}
+
+/*
+ * Prove that in the backwards-direction subgraph starting at <this>
+ * there is no lock matching <mask>:
+ */
+static int
+check_usage_backwards(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit bit, const char *irqclass)
+{
+ int ret;
+
+ find_usage_bit = bit;
+ /* fills in <backwards_match> */
+ ret = find_usage_backwards(this->class, 0);
+ if (!ret || ret == 1)
+ return ret;
+
+ return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
+}
+
+static inline void print_irqtrace_events(struct task_struct *curr)
+{
+ printk("irq event stamp: %u\n", curr->irq_events);
+ printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
+ print_ip_sym(curr->hardirq_enable_ip);
+ printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
+ print_ip_sym(curr->hardirq_disable_ip);
+ printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
+ print_ip_sym(curr->softirq_enable_ip);
+ printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
+ print_ip_sym(curr->softirq_disable_ip);
+}
+
+#else
+static inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+static int
+print_usage_bug(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
+{
+ __raw_spin_unlock(&hash_lock);
+ debug_locks_off();
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=================================\n");
+ printk( "[ INFO: inconsistent lock state ]\n");
+ printk( "---------------------------------\n");
+
+ printk("inconsistent {%s} -> {%s} usage.\n",
+ usage_str[prev_bit], usage_str[new_bit]);
+
+ printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+ curr->comm, curr->pid,
+ trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+ trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+ trace_hardirqs_enabled(curr),
+ trace_softirqs_enabled(curr));
+ print_lock(this);
+
+ printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+ print_stack_trace(this->class->usage_traces + prev_bit, 1);
+
+ print_irqtrace_events(curr);
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Print out an error if an invalid bit is set:
+ */
+static inline int
+valid_state(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
+{
+ if (unlikely(this->class->usage_mask & (1 << bad_bit)))
+ return print_usage_bug(curr, this, bad_bit, new_bit);
+ return 1;
+}
+
+#define STRICT_READ_CHECKS 1
+
+/*
+ * Mark a lock with a usage bit, and validate the state transition:
+ */
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit, unsigned long ip)
+{
+ unsigned int new_mask = 1 << new_bit, ret = 1;
+
+ /*
+ * If already set then do not dirty the cacheline,
+ * nor do any checks:
+ */
+ if (likely(this->class->usage_mask & new_mask))
+ return 1;
+
+ __raw_spin_lock(&hash_lock);
+ /*
+ * Make sure we didnt race:
+ */
+ if (unlikely(this->class->usage_mask & new_mask)) {
+ __raw_spin_unlock(&hash_lock);
+ return 1;
+ }
+
+ this->class->usage_mask |= new_mask;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ if (new_bit == LOCK_ENABLED_HARDIRQS ||
+ new_bit == LOCK_ENABLED_HARDIRQS_READ)
+ ip = curr->hardirq_enable_ip;
+ else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
+ new_bit == LOCK_ENABLED_SOFTIRQS_READ)
+ ip = curr->softirq_enable_ip;
+#endif
+ if (!save_trace(this->class->usage_traces + new_bit))
+ return 0;
+
+ switch (new_bit) {
+#ifdef CONFIG_TRACE_IRQFLAGS
+ case LOCK_USED_IN_HARDIRQ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_ENABLED_HARDIRQS_READ))
+ return 0;
+ /*
+ * just marked it hardirq-safe, check that this lock
+ * took no hardirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-safe, check that this lock
+ * took no hardirq-unsafe-read lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
+ return 0;
+#endif
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_SOFTIRQ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_ENABLED_SOFTIRQS_READ))
+ return 0;
+ /*
+ * just marked it softirq-safe, check that this lock
+ * took no softirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-safe, check that this lock
+ * took no softirq-unsafe-read lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
+ return 0;
+#endif
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_HARDIRQ_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
+ return 0;
+ /*
+ * just marked it hardirq-read-safe, check that this lock
+ * took no hardirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_HARDIRQS, "hard"))
+ return 0;
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_USED_IN_SOFTIRQ_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
+ return 0;
+ /*
+ * just marked it softirq-read-safe, check that this lock
+ * took no softirq-unsafe lock in the past:
+ */
+ if (!check_usage_forwards(curr, this,
+ LOCK_ENABLED_SOFTIRQS, "soft"))
+ return 0;
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_HARDIRQS:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_USED_IN_HARDIRQ_READ))
+ return 0;
+ /*
+ * just marked it hardirq-unsafe, check that no hardirq-safe
+ * lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ, "hard"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-unsafe, check that no
+ * hardirq-safe-read lock in the system ever took
+ * it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
+ return 0;
+#endif
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_SOFTIRQS:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+ if (!valid_state(curr, this, new_bit,
+ LOCK_USED_IN_SOFTIRQ_READ))
+ return 0;
+ /*
+ * just marked it softirq-unsafe, check that no softirq-safe
+ * lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ, "soft"))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-unsafe, check that no
+ * softirq-safe-read lock in the system ever took
+ * it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
+ return 0;
+#endif
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_HARDIRQS_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it hardirq-read-unsafe, check that no
+ * hardirq-safe lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_HARDIRQ, "hard"))
+ return 0;
+#endif
+ if (hardirq_verbose(this->class))
+ ret = 2;
+ break;
+ case LOCK_ENABLED_SOFTIRQS_READ:
+ if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
+ return 0;
+#if STRICT_READ_CHECKS
+ /*
+ * just marked it softirq-read-unsafe, check that no
+ * softirq-safe lock in the system ever took it in the past:
+ */
+ if (!check_usage_backwards(curr, this,
+ LOCK_USED_IN_SOFTIRQ, "soft"))
+ return 0;
+#endif
+ if (softirq_verbose(this->class))
+ ret = 2;
+ break;
+#endif
+ case LOCK_USED:
+ /*
+ * Add it to the global list of classes:
+ */
+ list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
+ debug_atomic_dec(&nr_unused_locks);
+ break;
+ default:
+ debug_locks_off();
+ WARN_ON(1);
+ return 0;
+ }
+
+ __raw_spin_unlock(&hash_lock);
+
+ /*
+ * We must printk outside of the hash_lock:
+ */
+ if (ret == 2) {
+ printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
+ print_lock(this);
+ print_irqtrace_events(curr);
+ dump_stack();
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+/*
+ * Mark all held locks with a usage bit:
+ */
+static int
+mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
+{
+ enum lock_usage_bit usage_bit;
+ struct held_lock *hlock;
+ int i;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ if (hardirq) {
+ if (hlock->read)
+ usage_bit = LOCK_ENABLED_HARDIRQS_READ;
+ else
+ usage_bit = LOCK_ENABLED_HARDIRQS;
+ } else {
+ if (hlock->read)
+ usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
+ else
+ usage_bit = LOCK_ENABLED_SOFTIRQS;
+ }
+ if (!mark_lock(curr, hlock, usage_bit, ip))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Debugging helper: via this flag we know that we are in
+ * 'early bootup code', and will warn about any invalid irqs-on event:
+ */
+static int early_boot_irqs_enabled;
+
+void early_boot_irqs_off(void)
+{
+ early_boot_irqs_enabled = 0;
+}
+
+void early_boot_irqs_on(void)
+{
+ early_boot_irqs_enabled = 1;
+}
+
+/*
+ * Hardirqs will be enabled:
+ */
+void trace_hardirqs_on(void)
+{
+ struct task_struct *curr = current;
+ unsigned long ip;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+ return;
+
+ if (unlikely(curr->hardirqs_enabled)) {
+ debug_atomic_inc(&redundant_hardirqs_on);
+ return;
+ }
+ /* we'll do an OFF -> ON transition: */
+ curr->hardirqs_enabled = 1;
+ ip = (unsigned long) __builtin_return_address(0);
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+ return;
+ /*
+ * We are going to turn hardirqs on, so set the
+ * usage bit for all held locks:
+ */
+ if (!mark_held_locks(curr, 1, ip))
+ return;
+ /*
+ * If we have softirqs enabled, then set the usage
+ * bit for all held locks. (disabled hardirqs prevented
+ * this bit from being set before)
+ */
+ if (curr->softirqs_enabled)
+ if (!mark_held_locks(curr, 0, ip))
+ return;
+
+ curr->hardirq_enable_ip = ip;
+ curr->hardirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(&hardirqs_on_events);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+/*
+ * Hardirqs were disabled:
+ */
+void trace_hardirqs_off(void)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks || current->lockdep_recursion))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->hardirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->hardirqs_enabled = 0;
+ curr->hardirq_disable_ip = _RET_IP_;
+ curr->hardirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(&hardirqs_off_events);
+ } else
+ debug_atomic_inc(&redundant_hardirqs_off);
+}
+
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+/*
+ * Softirqs will be enabled:
+ */
+void trace_softirqs_on(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ debug_atomic_inc(&redundant_softirqs_on);
+ return;
+ }
+
+ /*
+ * We'll do an OFF -> ON transition:
+ */
+ curr->softirqs_enabled = 1;
+ curr->softirq_enable_ip = ip;
+ curr->softirq_enable_event = ++curr->irq_events;
+ debug_atomic_inc(&softirqs_on_events);
+ /*
+ * We are going to turn softirqs on, so set the
+ * usage bit for all held locks, if hardirqs are
+ * enabled too:
+ */
+ if (curr->hardirqs_enabled)
+ mark_held_locks(curr, 0, ip);
+}
+
+/*
+ * Softirqs were disabled:
+ */
+void trace_softirqs_off(unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return;
+
+ if (curr->softirqs_enabled) {
+ /*
+ * We have done an ON -> OFF transition:
+ */
+ curr->softirqs_enabled = 0;
+ curr->softirq_disable_ip = ip;
+ curr->softirq_disable_event = ++curr->irq_events;
+ debug_atomic_inc(&softirqs_off_events);
+ DEBUG_LOCKS_WARN_ON(!softirq_count());
+ } else
+ debug_atomic_inc(&redundant_softirqs_off);
+}
+
+#endif
+
+/*
+ * Initialize a lock instance's lock-class mapping info:
+ */
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key)
+{
+ if (unlikely(!debug_locks))
+ return;
+
+ if (DEBUG_LOCKS_WARN_ON(!key))
+ return;
+ if (DEBUG_LOCKS_WARN_ON(!name))
+ return;
+ /*
+ * Sanity check, the lock-class key must be persistent:
+ */
+ if (!static_obj(key)) {
+ printk("BUG: key %p not in .data!\n", key);
+ DEBUG_LOCKS_WARN_ON(1);
+ return;
+ }
+ lock->name = name;
+ lock->key = key;
+ lock->class_cache = NULL;
+}
+
+EXPORT_SYMBOL_GPL(lockdep_init_map);
+
+/*
+ * This gets called for every mutex_lock*()/spin_lock*() operation.
+ * We maintain the dependency maps and validate the locking attempt:
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, int hardirqs_off,
+ unsigned long ip)
+{
+ struct task_struct *curr = current;
+ struct lock_class *class = NULL;
+ struct held_lock *hlock;
+ unsigned int depth, id;
+ int chain_head = 0;
+ u64 chain_key;
+
+ if (unlikely(!debug_locks))
+ return 0;
+
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
+ debug_locks_off();
+ printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+
+ if (!subclass)
+ class = lock->class_cache;
+ /*
+ * Not cached yet or subclass?
+ */
+ if (unlikely(!class)) {
+ class = register_lock_class(lock, subclass);
+ if (!class)
+ return 0;
+ }
+ debug_atomic_inc((atomic_t *)&class->ops);
+ if (very_verbose(class)) {
+ printk("\nacquire class [%p] %s", class->key, class->name);
+ if (class->name_version > 1)
+ printk("#%d", class->name_version);
+ printk("\n");
+ dump_stack();
+ }
+
+ /*
+ * Add the lock to the list of currently held locks.
+ * (we dont increase the depth just yet, up until the
+ * dependency checks are done)
+ */
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+ return 0;
+
+ hlock = curr->held_locks + depth;
+
+ hlock->class = class;
+ hlock->acquire_ip = ip;
+ hlock->instance = lock;
+ hlock->trylock = trylock;
+ hlock->read = read;
+ hlock->check = check;
+ hlock->hardirqs_off = hardirqs_off;
+
+ if (check != 2)
+ goto out_calc_hash;
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /*
+ * If non-trylock use in a hardirq or softirq context, then
+ * mark the lock as used in these contexts:
+ */
+ if (!trylock) {
+ if (read) {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_HARDIRQ_READ, ip))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock,
+ LOCK_USED_IN_SOFTIRQ_READ, ip))
+ return 0;
+ } else {
+ if (curr->hardirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
+ return 0;
+ if (curr->softirq_context)
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
+ return 0;
+ }
+ }
+ if (!hardirqs_off) {
+ if (read) {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQS_READ, ip))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQS_READ, ip))
+ return 0;
+ } else {
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_HARDIRQS, ip))
+ return 0;
+ if (curr->softirqs_enabled)
+ if (!mark_lock(curr, hlock,
+ LOCK_ENABLED_SOFTIRQS, ip))
+ return 0;
+ }
+ }
+#endif
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED, ip))
+ return 0;
+out_calc_hash:
+ /*
+ * Calculate the chain hash: it's the combined has of all the
+ * lock keys along the dependency chain. We save the hash value
+ * at every step so that we can get the current hash easily
+ * after unlock. The chain hash is then used to cache dependency
+ * results.
+ *
+ * The 'key ID' is what is the most compact key value to drive
+ * the hash, not class->key.
+ */
+ id = class - lock_classes;
+ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+ return 0;
+
+ chain_key = curr->curr_chain_key;
+ if (!depth) {
+ if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ return 0;
+ chain_head = 1;
+ }
+
+ hlock->prev_chain_key = chain_key;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ /*
+ * Keep track of points where we cross into an interrupt context:
+ */
+ hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
+ curr->softirq_context;
+ if (depth) {
+ struct held_lock *prev_hlock;
+
+ prev_hlock = curr->held_locks + depth-1;
+ /*
+ * If we cross into another context, reset the
+ * hash key (this also prevents the checking and the
+ * adding of the dependency to 'prev'):
+ */
+ if (prev_hlock->irq_context != hlock->irq_context) {
+ chain_key = 0;
+ chain_head = 1;
+ }
+ }
+#endif
+ chain_key = iterate_chain_key(chain_key, id);
+ curr->curr_chain_key = chain_key;
+
+ /*
+ * Trylock needs to maintain the stack of held locks, but it
+ * does not add new dependencies, because trylock can be done
+ * in any order.
+ *
+ * We look up the chain_key and do the O(N^2) check and update of
+ * the dependencies only if this is a new dependency chain.
+ * (If lookup_chain_cache() returns with 1 it acquires
+ * hash_lock for us)
+ */
+ if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) {
+ /*
+ * Check whether last held lock:
+ *
+ * - is irq-safe, if this lock is irq-unsafe
+ * - is softirq-safe, if this lock is hardirq-unsafe
+ *
+ * And check whether the new lock's dependency graph
+ * could lead back to the previous lock.
+ *
+ * any of these scenarios could lead to a deadlock. If
+ * All validations
+ */
+ int ret = check_deadlock(curr, hlock, lock, read);
+
+ if (!ret)
+ return 0;
+ /*
+ * Mark recursive read, as we jump over it when
+ * building dependencies (just like we jump over
+ * trylock entries):
+ */
+ if (ret == 2)
+ hlock->read = 2;
+ /*
+ * Add dependency only if this lock is not the head
+ * of the chain, and if it's not a secondary read-lock:
+ */
+ if (!chain_head && ret != 2)
+ if (!check_prevs_add(curr, hlock))
+ return 0;
+ __raw_spin_unlock(&hash_lock);
+ }
+ curr->lockdep_depth++;
+ check_chain_key(curr);
+ if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+ debug_locks_off();
+ printk("BUG: MAX_LOCK_DEPTH too low!\n");
+ printk("turning off the locking correctness validator.\n");
+ return 0;
+ }
+ if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+ max_lockdep_depth = curr->lockdep_depth;
+
+ return 1;
+}
+
+static int
+print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n=====================================\n");
+ printk( "[ BUG: bad unlock balance detected! ]\n");
+ printk( "-------------------------------------\n");
+ printk("%s/%d is trying to release lock (",
+ curr->comm, curr->pid);
+ print_lockdep_cache(lock);
+ printk(") at:\n");
+ print_ip_sym(ip);
+ printk("but there are no more locks to release!\n");
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+/*
+ * Common debugging checks for both nested and non-nested unlock:
+ */
+static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
+ unsigned long ip)
+{
+ if (unlikely(!debug_locks))
+ return 0;
+ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+ return 0;
+
+ if (curr->lockdep_depth <= 0)
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks in a
+ * potentially non-nested (out of order) manner. This is a
+ * relatively rare operation, as all the unlock APIs default
+ * to nested mode (which uses lock_release()):
+ */
+static int
+lock_release_non_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock, *prev_hlock;
+ unsigned int depth;
+ int i;
+
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
+ depth = curr->lockdep_depth;
+ if (DEBUG_LOCKS_WARN_ON(!depth))
+ return 0;
+
+ prev_hlock = NULL;
+ for (i = depth-1; i >= 0; i--) {
+ hlock = curr->held_locks + i;
+ /*
+ * We must not cross into another context:
+ */
+ if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+ break;
+ if (hlock->instance == lock)
+ goto found_it;
+ prev_hlock = hlock;
+ }
+ return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+ /*
+ * We have the right lock to unlock, 'hlock' points to it.
+ * Now we remove it from the stack, and add back the other
+ * entries (if any), recalculating the hash along the way:
+ */
+ curr->lockdep_depth = i;
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+ for (i++; i < depth; i++) {
+ hlock = curr->held_locks + i;
+ if (!__lock_acquire(hlock->instance,
+ hlock->class->subclass, hlock->trylock,
+ hlock->read, hlock->check, hlock->hardirqs_off,
+ hlock->acquire_ip))
+ return 0;
+ }
+
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+ return 0;
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static int lock_release_nested(struct task_struct *curr,
+ struct lockdep_map *lock, unsigned long ip)
+{
+ struct held_lock *hlock;
+ unsigned int depth;
+
+ /*
+ * Pop off the top of the lock stack:
+ */
+ depth = curr->lockdep_depth - 1;
+ hlock = curr->held_locks + depth;
+
+ /*
+ * Is the unlock non-nested:
+ */
+ if (hlock->instance != lock)
+ return lock_release_non_nested(curr, lock, ip);
+ curr->lockdep_depth--;
+
+ if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
+ return 0;
+
+ curr->curr_chain_key = hlock->prev_chain_key;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ hlock->prev_chain_key = 0;
+ hlock->class = NULL;
+ hlock->acquire_ip = 0;
+ hlock->irq_context = 0;
+#endif
+ return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static void
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+ struct task_struct *curr = current;
+
+ if (!check_unlock(curr, lock, ip))
+ return;
+
+ if (nested) {
+ if (!lock_release_nested(curr, lock, ip))
+ return;
+ } else {
+ if (!lock_release_non_nested(curr, lock, ip))
+ return;
+ }
+
+ check_chain_key(curr);
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
+ if (!debug_locks)
+ return;
+
+ if (irqs_disabled_flags(flags))
+ DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled);
+ else
+ DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled);
+
+ /*
+ * We dont accurately track softirq state in e.g.
+ * hardirq contexts (such as on 4KSTACKS), so only
+ * check if not in hardirq contexts:
+ */
+ if (!hardirq_count()) {
+ if (softirq_count())
+ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
+ else
+ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+ }
+
+ if (!debug_locks)
+ print_irqtrace_events(current);
+#endif
+}
+
+/*
+ * We are not always called with irqs disabled - do that here,
+ * and also avoid lockdep recursion:
+ */
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_acquire(lock, subclass, trylock, read, check,
+ irqs_disabled_flags(flags), ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_acquire);
+
+void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+ current->lockdep_recursion = 1;
+ __lock_release(lock, nested, ip);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(lock_release);
+
+/*
+ * Used by the testsuite, sanitize the validator state
+ * after a simulated failure:
+ */
+
+void lockdep_reset(void)
+{
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ current->curr_chain_key = 0;
+ current->lockdep_depth = 0;
+ current->lockdep_recursion = 0;
+ memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
+ nr_hardirq_chains = 0;
+ nr_softirq_chains = 0;
+ nr_process_chains = 0;
+ debug_locks = 1;
+ raw_local_irq_restore(flags);
+}
+
+static void zap_class(struct lock_class *class)
+{
+ int i;
+
+ /*
+ * Remove all dependencies this lock is
+ * involved in:
+ */
+ for (i = 0; i < nr_list_entries; i++) {
+ if (list_entries[i].class == class)
+ list_del_rcu(&list_entries[i].entry);
+ }
+ /*
+ * Unhash the class and remove it from the all_lock_classes list:
+ */
+ list_del_rcu(&class->hash_entry);
+ list_del_rcu(&class->lock_entry);
+
+}
+
+static inline int within(void *addr, void *start, unsigned long size)
+{
+ return addr >= start && addr < start + size;
+}
+
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+ struct lock_class *class, *next;
+ struct list_head *head;
+ unsigned long flags;
+ int i;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&hash_lock);
+
+ /*
+ * Unhash all classes that were created by this module:
+ */
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(class, next, head, hash_entry)
+ if (within(class->key, start, size))
+ zap_class(class);
+ }
+
+ __raw_spin_unlock(&hash_lock);
+ raw_local_irq_restore(flags);
+}
+
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+ struct lock_class *class, *next;
+ struct list_head *head;
+ unsigned long flags;
+ int i, j;
+
+ raw_local_irq_save(flags);
+
+ /*
+ * Remove all classes this lock might have:
+ */
+ for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
+ /*
+ * If the class exists we look it up and zap it:
+ */
+ class = look_up_lock_class(lock, j);
+ if (class)
+ zap_class(class);
+ }
+ /*
+ * Debug check: in the end all mapped classes should
+ * be gone.
+ */
+ __raw_spin_lock(&hash_lock);
+ for (i = 0; i < CLASSHASH_SIZE; i++) {
+ head = classhash_table + i;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry_safe(class, next, head, hash_entry) {
+ if (unlikely(class == lock->class_cache)) {
+ __raw_spin_unlock(&hash_lock);
+ DEBUG_LOCKS_WARN_ON(1);
+ goto out_restore;
+ }
+ }
+ }
+ __raw_spin_unlock(&hash_lock);
+
+out_restore:
+ raw_local_irq_restore(flags);
+}
+
+void __init lockdep_init(void)
+{
+ int i;
+
+ /*
+ * Some architectures have their own start_kernel()
+ * code which calls lockdep_init(), while we also
+ * call lockdep_init() from the start_kernel() itself,
+ * and we want to initialize the hashes only once:
+ */
+ if (lockdep_initialized)
+ return;
+
+ for (i = 0; i < CLASSHASH_SIZE; i++)
+ INIT_LIST_HEAD(classhash_table + i);
+
+ for (i = 0; i < CHAINHASH_SIZE; i++)
+ INIT_LIST_HEAD(chainhash_table + i);
+
+ lockdep_initialized = 1;
+}
+
+void __init lockdep_info(void)
+{
+ printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+ printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+ printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+ printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+ printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+ printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+ printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+ printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+ printk(" memory used by lock dependency info: %lu kB\n",
+ (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
+ sizeof(struct list_head) * CLASSHASH_SIZE +
+ sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
+ sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
+ sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
+
+ printk(" per task-struct memory footprint: %lu bytes\n",
+ sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ if (lockdep_init_error)
+ printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n");
+#endif
+}
+
+static inline int in_range(const void *start, const void *addr, const void *end)
+{
+ return addr >= start && addr <= end;
+}
+
+static void
+print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
+ const void *mem_to, struct held_lock *hlock)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n=========================\n");
+ printk( "[ BUG: held lock freed! ]\n");
+ printk( "-------------------------\n");
+ printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+ curr->comm, curr->pid, mem_from, mem_to-1);
+ print_lock(hlock);
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a lock
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range of <from> to <to>:
+ */
+void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
+{
+ const void *mem_to = mem_from + mem_len, *lock_from, *lock_to;
+ struct task_struct *curr = current;
+ struct held_lock *hlock;
+ unsigned long flags;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return;
+
+ local_irq_save(flags);
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ hlock = curr->held_locks + i;
+
+ lock_from = (void *)hlock->instance;
+ lock_to = (void *)(hlock->instance + 1);
+
+ if (!in_range(mem_from, lock_from, mem_to) &&
+ !in_range(mem_from, lock_to, mem_to))
+ continue;
+
+ print_freed_lock_bug(curr, mem_from, mem_to, hlock);
+ break;
+ }
+ local_irq_restore(flags);
+}
+
+static void print_held_locks_bug(struct task_struct *curr)
+{
+ if (!debug_locks_off())
+ return;
+ if (debug_locks_silent)
+ return;
+
+ printk("\n=====================================\n");
+ printk( "[ BUG: lock held at task exit time! ]\n");
+ printk( "-------------------------------------\n");
+ printk("%s/%d is exiting with locks still held!\n",
+ curr->comm, curr->pid);
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+
+void debug_check_no_locks_held(struct task_struct *task)
+{
+ if (unlikely(task->lockdep_depth > 0))
+ print_held_locks_bug(task);
+}
+
+void debug_show_all_locks(void)
+{
+ struct task_struct *g, *p;
+ int count = 10;
+ int unlock = 1;
+
+ printk("\nShowing all locks held in the system:\n");
+
+ /*
+ * Here we try to get the tasklist_lock as hard as possible,
+ * if not successful after 2 seconds we ignore it (but keep
+ * trying). This is to enable a debug printout even if a
+ * tasklist_lock-holding task deadlocks or crashes.
+ */
+retry:
+ if (!read_trylock(&tasklist_lock)) {
+ if (count == 10)
+ printk("hm, tasklist_lock locked, retrying... ");
+ if (count) {
+ count--;
+ printk(" #%d", 10-count);
+ mdelay(200);
+ goto retry;
+ }
+ printk(" ignoring it.\n");
+ unlock = 0;
+ }
+ if (count != 10)
+ printk(" locked it.\n");
+
+ do_each_thread(g, p) {
+ if (p->lockdep_depth)
+ lockdep_print_held_locks(p);
+ if (!unlock)
+ if (read_trylock(&tasklist_lock))
+ unlock = 1;
+ } while_each_thread(g, p);
+
+ printk("\n");
+ printk("=============================================\n\n");
+
+ if (unlock)
+ read_unlock(&tasklist_lock);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_all_locks);
+
+void debug_show_held_locks(struct task_struct *task)
+{
+ lockdep_print_held_locks(task);
+}
+
+EXPORT_SYMBOL_GPL(debug_show_held_locks);
+
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
new file mode 100644
index 00000000000..eab043c83bb
--- /dev/null
+++ b/kernel/lockdep_internals.h
@@ -0,0 +1,78 @@
+/*
+ * kernel/lockdep_internals.h
+ *
+ * Runtime locking correctness validator
+ *
+ * lockdep subsystem internal functions and variables.
+ */
+
+/*
+ * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
+ * we track.
+ *
+ * We use the per-lock dependency maps in two ways: we grow it by adding
+ * every to-be-taken lock to all currently held lock's own dependency
+ * table (if it's not there yet), and we check it for lock order
+ * conflicts and deadlocks.
+ */
+#define MAX_LOCKDEP_ENTRIES 8192UL
+
+#define MAX_LOCKDEP_KEYS_BITS 11
+#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
+
+#define MAX_LOCKDEP_CHAINS_BITS 13
+#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
+
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the hash_lock.
+ */
+#define MAX_STACK_TRACE_ENTRIES 262144UL
+
+extern struct list_head all_lock_classes;
+
+extern void
+get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
+
+extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+
+extern unsigned long nr_lock_classes;
+extern unsigned long nr_list_entries;
+extern unsigned long nr_lock_chains;
+extern unsigned long nr_stack_trace_entries;
+
+extern unsigned int nr_hardirq_chains;
+extern unsigned int nr_softirq_chains;
+extern unsigned int nr_process_chains;
+extern unsigned int max_lockdep_depth;
+extern unsigned int max_recursion_depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Various lockdep statistics:
+ */
+extern atomic_t chain_lookup_hits;
+extern atomic_t chain_lookup_misses;
+extern atomic_t hardirqs_on_events;
+extern atomic_t hardirqs_off_events;
+extern atomic_t redundant_hardirqs_on;
+extern atomic_t redundant_hardirqs_off;
+extern atomic_t softirqs_on_events;
+extern atomic_t softirqs_off_events;
+extern atomic_t redundant_softirqs_on;
+extern atomic_t redundant_softirqs_off;
+extern atomic_t nr_unused_locks;
+extern atomic_t nr_cyclic_checks;
+extern atomic_t nr_cyclic_check_recursions;
+extern atomic_t nr_find_usage_forwards_checks;
+extern atomic_t nr_find_usage_forwards_recursions;
+extern atomic_t nr_find_usage_backwards_checks;
+extern atomic_t nr_find_usage_backwards_recursions;
+# define debug_atomic_inc(ptr) atomic_inc(ptr)
+# define debug_atomic_dec(ptr) atomic_dec(ptr)
+# define debug_atomic_read(ptr) atomic_read(ptr)
+#else
+# define debug_atomic_inc(ptr) do { } while (0)
+# define debug_atomic_dec(ptr) do { } while (0)
+# define debug_atomic_read(ptr) 0
+#endif
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
new file mode 100644
index 00000000000..f6e72eaab3f
--- /dev/null
+++ b/kernel/lockdep_proc.c
@@ -0,0 +1,345 @@
+/*
+ * kernel/lockdep_proc.c
+ *
+ * Runtime locking correctness validator
+ *
+ * Started by Ingo Molnar:
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Code for /proc/lockdep and /proc/lockdep_stats:
+ *
+ */
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/debug_locks.h>
+
+#include "lockdep_internals.h"
+
+static void *l_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct lock_class *class = v;
+
+ (*pos)++;
+
+ if (class->lock_entry.next != &all_lock_classes)
+ class = list_entry(class->lock_entry.next, struct lock_class,
+ lock_entry);
+ else
+ class = NULL;
+ m->private = class;
+
+ return class;
+}
+
+static void *l_start(struct seq_file *m, loff_t *pos)
+{
+ struct lock_class *class = m->private;
+
+ if (&class->lock_entry == all_lock_classes.next)
+ seq_printf(m, "all lock classes:\n");
+
+ return class;
+}
+
+static void l_stop(struct seq_file *m, void *v)
+{
+}
+
+static unsigned long count_forward_deps(struct lock_class *class)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_after, entry)
+ ret += count_forward_deps(entry->class);
+
+ return ret;
+}
+
+static unsigned long count_backward_deps(struct lock_class *class)
+{
+ struct lock_list *entry;
+ unsigned long ret = 1;
+
+ /*
+ * Recurse this class's dependency list:
+ */
+ list_for_each_entry(entry, &class->locks_before, entry)
+ ret += count_backward_deps(entry->class);
+
+ return ret;
+}
+
+static int l_show(struct seq_file *m, void *v)
+{
+ unsigned long nr_forward_deps, nr_backward_deps;
+ struct lock_class *class = m->private;
+ char str[128], c1, c2, c3, c4;
+ const char *name;
+
+ seq_printf(m, "%p", class->key);
+#ifdef CONFIG_DEBUG_LOCKDEP
+ seq_printf(m, " OPS:%8ld", class->ops);
+#endif
+ nr_forward_deps = count_forward_deps(class);
+ seq_printf(m, " FD:%5ld", nr_forward_deps);
+
+ nr_backward_deps = count_backward_deps(class);
+ seq_printf(m, " BD:%5ld", nr_backward_deps);
+
+ get_usage_chars(class, &c1, &c2, &c3, &c4);
+ seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
+
+ name = class->name;
+ if (!name) {
+ name = __get_key_name(class->key, str);
+ seq_printf(m, ": %s", name);
+ } else{
+ seq_printf(m, ": %s", name);
+ if (class->name_version > 1)
+ seq_printf(m, "#%d", class->name_version);
+ if (class->subclass)
+ seq_printf(m, "/%d", class->subclass);
+ }
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static struct seq_operations lockdep_ops = {
+ .start = l_start,
+ .next = l_next,
+ .stop = l_stop,
+ .show = l_show,
+};
+
+static int lockdep_open(struct inode *inode, struct file *file)
+{
+ int res = seq_open(file, &lockdep_ops);
+ if (!res) {
+ struct seq_file *m = file->private_data;
+
+ if (!list_empty(&all_lock_classes))
+ m->private = list_entry(all_lock_classes.next,
+ struct lock_class, lock_entry);
+ else
+ m->private = NULL;
+ }
+ return res;
+}
+
+static struct file_operations proc_lockdep_operations = {
+ .open = lockdep_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void lockdep_stats_debug_show(struct seq_file *m)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+ unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
+ hi2 = debug_atomic_read(&hardirqs_off_events),
+ hr1 = debug_atomic_read(&redundant_hardirqs_on),
+ hr2 = debug_atomic_read(&redundant_hardirqs_off),
+ si1 = debug_atomic_read(&softirqs_on_events),
+ si2 = debug_atomic_read(&softirqs_off_events),
+ sr1 = debug_atomic_read(&redundant_softirqs_on),
+ sr2 = debug_atomic_read(&redundant_softirqs_off);
+
+ seq_printf(m, " chain lookup misses: %11u\n",
+ debug_atomic_read(&chain_lookup_misses));
+ seq_printf(m, " chain lookup hits: %11u\n",
+ debug_atomic_read(&chain_lookup_hits));
+ seq_printf(m, " cyclic checks: %11u\n",
+ debug_atomic_read(&nr_cyclic_checks));
+ seq_printf(m, " cyclic-check recursions: %11u\n",
+ debug_atomic_read(&nr_cyclic_check_recursions));
+ seq_printf(m, " find-mask forwards checks: %11u\n",
+ debug_atomic_read(&nr_find_usage_forwards_checks));
+ seq_printf(m, " find-mask forwards recursions: %11u\n",
+ debug_atomic_read(&nr_find_usage_forwards_recursions));
+ seq_printf(m, " find-mask backwards checks: %11u\n",
+ debug_atomic_read(&nr_find_usage_backwards_checks));
+ seq_printf(m, " find-mask backwards recursions:%11u\n",
+ debug_atomic_read(&nr_find_usage_backwards_recursions));
+
+ seq_printf(m, " hardirq on events: %11u\n", hi1);
+ seq_printf(m, " hardirq off events: %11u\n", hi2);
+ seq_printf(m, " redundant hardirq ons: %11u\n", hr1);
+ seq_printf(m, " redundant hardirq offs: %11u\n", hr2);
+ seq_printf(m, " softirq on events: %11u\n", si1);
+ seq_printf(m, " softirq off events: %11u\n", si2);
+ seq_printf(m, " redundant softirq ons: %11u\n", sr1);
+ seq_printf(m, " redundant softirq offs: %11u\n", sr2);
+#endif
+}
+
+static int lockdep_stats_show(struct seq_file *m, void *v)
+{
+ struct lock_class *class;
+ unsigned long nr_unused = 0, nr_uncategorized = 0,
+ nr_irq_safe = 0, nr_irq_unsafe = 0,
+ nr_softirq_safe = 0, nr_softirq_unsafe = 0,
+ nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
+ nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
+ nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
+ nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
+ sum_forward_deps = 0, factor = 0;
+
+ list_for_each_entry(class, &all_lock_classes, lock_entry) {
+
+ if (class->usage_mask == 0)
+ nr_unused++;
+ if (class->usage_mask == LOCKF_USED)
+ nr_uncategorized++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ)
+ nr_irq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQS)
+ nr_irq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
+ nr_softirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
+ nr_softirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
+ nr_hardirq_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
+ nr_hardirq_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
+ nr_irq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
+ nr_irq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
+ nr_softirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
+ nr_softirq_read_unsafe++;
+ if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
+ nr_hardirq_read_safe++;
+ if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
+ nr_hardirq_read_unsafe++;
+
+ sum_forward_deps += count_forward_deps(class);
+ }
+#ifdef CONFIG_LOCKDEP_DEBUG
+ DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
+#endif
+ seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
+ nr_lock_classes, MAX_LOCKDEP_KEYS);
+ seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
+ nr_list_entries, MAX_LOCKDEP_ENTRIES);
+ seq_printf(m, " indirect dependencies: %11lu\n",
+ sum_forward_deps);
+
+ /*
+ * Total number of dependencies:
+ *
+ * All irq-safe locks may nest inside irq-unsafe locks,
+ * plus all the other known dependencies:
+ */
+ seq_printf(m, " all direct dependencies: %11lu\n",
+ nr_irq_unsafe * nr_irq_safe +
+ nr_hardirq_unsafe * nr_hardirq_safe +
+ nr_list_entries);
+
+ /*
+ * Estimated factor between direct and indirect
+ * dependencies:
+ */
+ if (nr_list_entries)
+ factor = sum_forward_deps / nr_list_entries;
+
+ seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
+ nr_lock_chains, MAX_LOCKDEP_CHAINS);
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ seq_printf(m, " in-hardirq chains: %11u\n",
+ nr_hardirq_chains);
+ seq_printf(m, " in-softirq chains: %11u\n",
+ nr_softirq_chains);
+#endif
+ seq_printf(m, " in-process chains: %11u\n",
+ nr_process_chains);
+ seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
+ nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+ seq_printf(m, " combined max dependencies: %11u\n",
+ (nr_hardirq_chains + 1) *
+ (nr_softirq_chains + 1) *
+ (nr_process_chains + 1)
+ );
+ seq_printf(m, " hardirq-safe locks: %11lu\n",
+ nr_hardirq_safe);
+ seq_printf(m, " hardirq-unsafe locks: %11lu\n",
+ nr_hardirq_unsafe);
+ seq_printf(m, " softirq-safe locks: %11lu\n",
+ nr_softirq_safe);
+ seq_printf(m, " softirq-unsafe locks: %11lu\n",
+ nr_softirq_unsafe);
+ seq_printf(m, " irq-safe locks: %11lu\n",
+ nr_irq_safe);
+ seq_printf(m, " irq-unsafe locks: %11lu\n",
+ nr_irq_unsafe);
+
+ seq_printf(m, " hardirq-read-safe locks: %11lu\n",
+ nr_hardirq_read_safe);
+ seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
+ nr_hardirq_read_unsafe);
+ seq_printf(m, " softirq-read-safe locks: %11lu\n",
+ nr_softirq_read_safe);
+ seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
+ nr_softirq_read_unsafe);
+ seq_printf(m, " irq-read-safe locks: %11lu\n",
+ nr_irq_read_safe);
+ seq_printf(m, " irq-read-unsafe locks: %11lu\n",
+ nr_irq_read_unsafe);
+
+ seq_printf(m, " uncategorized locks: %11lu\n",
+ nr_uncategorized);
+ seq_printf(m, " unused locks: %11lu\n",
+ nr_unused);
+ seq_printf(m, " max locking depth: %11u\n",
+ max_lockdep_depth);
+ seq_printf(m, " max recursion depth: %11u\n",
+ max_recursion_depth);
+ lockdep_stats_debug_show(m);
+ seq_printf(m, " debug_locks: %11u\n",
+ debug_locks);
+
+ return 0;
+}
+
+static int lockdep_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lockdep_stats_show, NULL);
+}
+
+static struct file_operations proc_lockdep_stats_operations = {
+ .open = lockdep_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init lockdep_proc_init(void)
+{
+ struct proc_dir_entry *entry;
+
+ entry = create_proc_entry("lockdep", S_IRUSR, NULL);
+ if (entry)
+ entry->proc_fops = &proc_lockdep_operations;
+
+ entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
+ if (entry)
+ entry->proc_fops = &proc_lockdep_stats_operations;
+
+ return 0;
+}
+
+__initcall(lockdep_proc_init);
+
diff --git a/kernel/module.c b/kernel/module.c
index 281172f01e9..2a19cd47c04 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1121,6 +1121,9 @@ static void free_module(struct module *mod)
if (mod->percpu)
percpu_modfree(mod->percpu);
+ /* Free lock-classes: */
+ lockdep_free_key_range(mod->module_core, mod->core_size);
+
/* Finally, free the core (containing the module structure) */
module_free(mod, mod->module_core);
}
@@ -2016,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr,
return NULL;
}
-struct module *module_get_kallsym(unsigned int symnum,
- unsigned long *value,
- char *type,
- char namebuf[128])
+struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
+ char *type, char *name, size_t namelen)
{
struct module *mod;
@@ -2028,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum,
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
- strncpy(namebuf,
- mod->strtab + mod->symtab[symnum].st_name,
- 127);
+ strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+ namelen);
mutex_unlock(&module_mutex);
return mod;
}
@@ -2159,6 +2159,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
return e;
}
+/*
+ * Is this a valid module address?
+ */
+int is_module_address(unsigned long addr)
+{
+ unsigned long flags;
+ struct module *mod;
+
+ spin_lock_irqsave(&modlist_lock, flags);
+
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_core, mod->core_size)) {
+ spin_unlock_irqrestore(&modlist_lock, flags);
+ return 1;
+ }
+ }
+
+ spin_unlock_irqrestore(&modlist_lock, flags);
+
+ return 0;
+}
+
+
/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
struct module *__module_text_address(unsigned long addr)
{
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index e38e4bac97c..e3203c654dd 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -20,367 +20,19 @@
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
#include "mutex-debug.h"
/*
- * We need a global lock when we walk through the multi-process
- * lock tree. Only used in the deadlock-debugging case.
- */
-DEFINE_SPINLOCK(debug_mutex_lock);
-
-/*
- * All locks held by all tasks, in a single global list:
- */
-LIST_HEAD(debug_mutex_held_locks);
-
-/*
- * In the debug case we carry the caller's instruction pointer into
- * other functions, but we dont want the function argument overhead
- * in the nondebug case - hence these macros:
- */
-#define __IP_DECL__ , unsigned long ip
-#define __IP__ , ip
-#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
-
-/*
- * "mutex debugging enabled" flag. We turn it off when we detect
- * the first problem because we dont want to recurse back
- * into the tracing code when doing error printk or
- * executing a BUG():
- */
-int debug_mutex_on = 1;
-
-static void printk_task(struct task_struct *p)
-{
- if (p)
- printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_ti(struct thread_info *ti)
-{
- if (ti)
- printk_task(ti->task);
- else
- printk("<none>");
-}
-
-static void printk_task_short(struct task_struct *p)
-{
- if (p)
- printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
- else
- printk("<none>");
-}
-
-static void printk_lock(struct mutex *lock, int print_owner)
-{
- printk(" [%p] {%s}\n", lock, lock->name);
-
- if (print_owner && lock->owner) {
- printk(".. held by: ");
- printk_ti(lock->owner);
- printk("\n");
- }
- if (lock->owner) {
- printk("... acquired at: ");
- print_symbol("%s\n", lock->acquire_ip);
- }
-}
-
-/*
- * printk locks held by a task:
- */
-static void show_task_locks(struct task_struct *p)
-{
- switch (p->state) {
- case TASK_RUNNING: printk("R"); break;
- case TASK_INTERRUPTIBLE: printk("S"); break;
- case TASK_UNINTERRUPTIBLE: printk("D"); break;
- case TASK_STOPPED: printk("T"); break;
- case EXIT_ZOMBIE: printk("Z"); break;
- case EXIT_DEAD: printk("X"); break;
- default: printk("?"); break;
- }
- printk_task(p);
- if (p->blocked_on) {
- struct mutex *lock = p->blocked_on->lock;
-
- printk(" blocked on mutex:");
- printk_lock(lock, 1);
- } else
- printk(" (not blocked on mutex)\n");
-}
-
-/*
- * printk all locks held in the system (if filter == NULL),
- * or all locks belonging to a single task (if filter != NULL):
- */
-void show_held_locks(struct task_struct *filter)
-{
- struct list_head *curr, *cursor = NULL;
- struct mutex *lock;
- struct thread_info *t;
- unsigned long flags;
- int count = 0;
-
- if (filter) {
- printk("------------------------------\n");
- printk("| showing all locks held by: | (");
- printk_task_short(filter);
- printk("):\n");
- printk("------------------------------\n");
- } else {
- printk("---------------------------\n");
- printk("| showing all locks held: |\n");
- printk("---------------------------\n");
- }
-
- /*
- * Play safe and acquire the global trace lock. We
- * cannot printk with that lock held so we iterate
- * very carefully:
- */
-next:
- debug_spin_lock_save(&debug_mutex_lock, flags);
- list_for_each(curr, &debug_mutex_held_locks) {
- if (cursor && curr != cursor)
- continue;
- lock = list_entry(curr, struct mutex, held_list);
- t = lock->owner;
- if (filter && (t != filter->thread_info))
- continue;
- count++;
- cursor = curr->next;
- debug_spin_unlock_restore(&debug_mutex_lock, flags);
-
- printk("\n#%03d: ", count);
- printk_lock(lock, filter ? 0 : 1);
- goto next;
- }
- debug_spin_unlock_restore(&debug_mutex_lock, flags);
- printk("\n");
-}
-
-void mutex_debug_show_all_locks(void)
-{
- struct task_struct *g, *p;
- int count = 10;
- int unlock = 1;
-
- printk("\nShowing all blocking locks in the system:\n");
-
- /*
- * Here we try to get the tasklist_lock as hard as possible,
- * if not successful after 2 seconds we ignore it (but keep
- * trying). This is to enable a debug printout even if a
- * tasklist_lock-holding task deadlocks or crashes.
- */
-retry:
- if (!read_trylock(&tasklist_lock)) {
- if (count == 10)
- printk("hm, tasklist_lock locked, retrying... ");
- if (count) {
- count--;
- printk(" #%d", 10-count);
- mdelay(200);
- goto retry;
- }
- printk(" ignoring it.\n");
- unlock = 0;
- }
- if (count != 10)
- printk(" locked it.\n");
-
- do_each_thread(g, p) {
- show_task_locks(p);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
- } while_each_thread(g, p);
-
- printk("\n");
- show_held_locks(NULL);
- printk("=============================================\n\n");
-
- if (unlock)
- read_unlock(&tasklist_lock);
-}
-
-static void report_deadlock(struct task_struct *task, struct mutex *lock,
- struct mutex *lockblk, unsigned long ip)
-{
- printk("\n%s/%d is trying to acquire this lock:\n",
- current->comm, current->pid);
- printk_lock(lock, 1);
- printk("... trying at: ");
- print_symbol("%s\n", ip);
- show_held_locks(current);
-
- if (lockblk) {
- printk("but %s/%d is deadlocking current task %s/%d!\n\n",
- task->comm, task->pid, current->comm, current->pid);
- printk("\n%s/%d is blocked on this lock:\n",
- task->comm, task->pid);
- printk_lock(lockblk, 1);
-
- show_held_locks(task);
-
- printk("\n%s/%d's [blocked] stackdump:\n\n",
- task->comm, task->pid);
- show_stack(task, NULL);
- }
-
- printk("\n%s/%d's [current] stackdump:\n\n",
- current->comm, current->pid);
- dump_stack();
- mutex_debug_show_all_locks();
- printk("[ turning off deadlock detection. Please report this. ]\n\n");
- local_irq_disable();
-}
-
-/*
- * Recursively check for mutex deadlocks:
- */
-static int check_deadlock(struct mutex *lock, int depth,
- struct thread_info *ti, unsigned long ip)
-{
- struct mutex *lockblk;
- struct task_struct *task;
-
- if (!debug_mutex_on)
- return 0;
-
- ti = lock->owner;
- if (!ti)
- return 0;
-
- task = ti->task;
- lockblk = NULL;
- if (task->blocked_on)
- lockblk = task->blocked_on->lock;
-
- /* Self-deadlock: */
- if (current == task) {
- DEBUG_OFF();
- if (depth)
- return 1;
- printk("\n==========================================\n");
- printk( "[ BUG: lock recursion deadlock detected! |\n");
- printk( "------------------------------------------\n");
- report_deadlock(task, lock, NULL, ip);
- return 0;
- }
-
- /* Ugh, something corrupted the lock data structure? */
- if (depth > 20) {
- DEBUG_OFF();
- printk("\n===========================================\n");
- printk( "[ BUG: infinite lock dependency detected!? |\n");
- printk( "-------------------------------------------\n");
- report_deadlock(task, lock, lockblk, ip);
- return 0;
- }
-
- /* Recursively check for dependencies: */
- if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) {
- printk("\n============================================\n");
- printk( "[ BUG: circular locking deadlock detected! ]\n");
- printk( "--------------------------------------------\n");
- report_deadlock(task, lock, lockblk, ip);
- return 0;
- }
- return 0;
-}
-
-/*
- * Called when a task exits, this function checks whether the
- * task is holding any locks, and reports the first one if so:
- */
-void mutex_debug_check_no_locks_held(struct task_struct *task)
-{
- struct list_head *curr, *next;
- struct thread_info *t;
- unsigned long flags;
- struct mutex *lock;
-
- if (!debug_mutex_on)
- return;
-
- debug_spin_lock_save(&debug_mutex_lock, flags);
- list_for_each_safe(curr, next, &debug_mutex_held_locks) {
- lock = list_entry(curr, struct mutex, held_list);
- t = lock->owner;
- if (t != task->thread_info)
- continue;
- list_del_init(curr);
- DEBUG_OFF();
- debug_spin_unlock_restore(&debug_mutex_lock, flags);
-
- printk("BUG: %s/%d, lock held at task exit time!\n",
- task->comm, task->pid);
- printk_lock(lock, 1);
- if (lock->owner != task->thread_info)
- printk("exiting task is not even the owner??\n");
- return;
- }
- debug_spin_unlock_restore(&debug_mutex_lock, flags);
-}
-
-/*
- * Called when kernel memory is freed (or unmapped), or if a mutex
- * is destroyed or reinitialized - this code checks whether there is
- * any held lock in the memory range of <from> to <to>:
- */
-void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
-{
- struct list_head *curr, *next;
- const void *to = from + len;
- unsigned long flags;
- struct mutex *lock;
- void *lock_addr;
-
- if (!debug_mutex_on)
- return;
-
- debug_spin_lock_save(&debug_mutex_lock, flags);
- list_for_each_safe(curr, next, &debug_mutex_held_locks) {
- lock = list_entry(curr, struct mutex, held_list);
- lock_addr = lock;
- if (lock_addr < from || lock_addr >= to)
- continue;
- list_del_init(curr);
- DEBUG_OFF();
- debug_spin_unlock_restore(&debug_mutex_lock, flags);
-
- printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
- current->comm, current->pid, lock, from, to);
- dump_stack();
- printk_lock(lock, 1);
- if (lock->owner != current_thread_info())
- printk("freeing task is not even the owner??\n");
- return;
- }
- debug_spin_unlock_restore(&debug_mutex_lock, flags);
-}
-
-/*
* Must be called with lock->wait_lock held.
*/
-void debug_mutex_set_owner(struct mutex *lock,
- struct thread_info *new_owner __IP_DECL__)
+void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
{
lock->owner = new_owner;
- DEBUG_WARN_ON(!list_empty(&lock->held_list));
- if (debug_mutex_on) {
- list_add_tail(&lock->held_list, &debug_mutex_held_locks);
- lock->acquire_ip = ip;
- }
}
-void debug_mutex_init_waiter(struct mutex_waiter *waiter)
+void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
waiter->magic = waiter;
@@ -389,23 +41,23 @@ void debug_mutex_init_waiter(struct mutex_waiter *waiter)
void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
{
- SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
- DEBUG_WARN_ON(list_empty(&lock->wait_list));
- DEBUG_WARN_ON(waiter->magic != waiter);
- DEBUG_WARN_ON(list_empty(&waiter->list));
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+ DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
+ DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
}
void debug_mutex_free_waiter(struct mutex_waiter *waiter)
{
- DEBUG_WARN_ON(!list_empty(&waiter->list));
+ DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti __IP_DECL__)
+ struct thread_info *ti)
{
- SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock));
- check_deadlock(lock, 0, ti, ip);
+ SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
+
/* Mark the current thread as blocked on the lock: */
ti->task->blocked_on = waiter;
waiter->lock = lock;
@@ -414,9 +66,9 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti)
{
- DEBUG_WARN_ON(list_empty(&waiter->list));
- DEBUG_WARN_ON(waiter->task != ti->task);
- DEBUG_WARN_ON(ti->task->blocked_on != waiter);
+ DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
+ DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
+ DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
ti->task->blocked_on = NULL;
list_del_init(&waiter->list);
@@ -425,24 +77,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
void debug_mutex_unlock(struct mutex *lock)
{
- DEBUG_WARN_ON(lock->magic != lock);
- DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
- DEBUG_WARN_ON(lock->owner != current_thread_info());
- if (debug_mutex_on) {
- DEBUG_WARN_ON(list_empty(&lock->held_list));
- list_del_init(&lock->held_list);
- }
+ DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+ DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
+ DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
}
-void debug_mutex_init(struct mutex *lock, const char *name)
+void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key)
{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make sure we are not reinitializing a held lock:
*/
- mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key);
+#endif
lock->owner = NULL;
- INIT_LIST_HEAD(&lock->held_list);
- lock->name = name;
lock->magic = lock;
}
@@ -456,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name)
*/
void fastcall mutex_destroy(struct mutex *lock)
{
- DEBUG_WARN_ON(mutex_is_locked(lock));
+ DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
lock->magic = NULL;
}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index a5196c36a5f..babfbdfc534 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -10,110 +10,44 @@
* More details are in kernel/mutex-debug.c.
*/
-extern spinlock_t debug_mutex_lock;
-extern struct list_head debug_mutex_held_locks;
-extern int debug_mutex_on;
-
-/*
- * In the debug case we carry the caller's instruction pointer into
- * other functions, but we dont want the function argument overhead
- * in the nondebug case - hence these macros:
- */
-#define __IP_DECL__ , unsigned long ip
-#define __IP__ , ip
-#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
-
/*
* This must be called with lock->wait_lock held.
*/
-extern void debug_mutex_set_owner(struct mutex *lock,
- struct thread_info *new_owner __IP_DECL__);
+extern void
+debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
static inline void debug_mutex_clear_owner(struct mutex *lock)
{
lock->owner = NULL;
}
-extern void debug_mutex_init_waiter(struct mutex_waiter *waiter);
+extern void debug_mutex_lock_common(struct mutex *lock,
+ struct mutex_waiter *waiter);
extern void debug_mutex_wake_waiter(struct mutex *lock,
struct mutex_waiter *waiter);
extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
extern void debug_mutex_add_waiter(struct mutex *lock,
struct mutex_waiter *waiter,
- struct thread_info *ti __IP_DECL__);
+ struct thread_info *ti);
extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti);
extern void debug_mutex_unlock(struct mutex *lock);
-extern void debug_mutex_init(struct mutex *lock, const char *name);
-
-#define debug_spin_lock_save(lock, flags) \
- do { \
- local_irq_save(flags); \
- if (debug_mutex_on) \
- spin_lock(lock); \
- } while (0)
-
-#define debug_spin_unlock_restore(lock, flags) \
- do { \
- if (debug_mutex_on) \
- spin_unlock(lock); \
- local_irq_restore(flags); \
- preempt_check_resched(); \
- } while (0)
+extern void debug_mutex_init(struct mutex *lock, const char *name,
+ struct lock_class_key *key);
#define spin_lock_mutex(lock, flags) \
do { \
struct mutex *l = container_of(lock, struct mutex, wait_lock); \
\
- DEBUG_WARN_ON(in_interrupt()); \
- debug_spin_lock_save(&debug_mutex_lock, flags); \
- spin_lock(lock); \
- DEBUG_WARN_ON(l->magic != l); \
+ DEBUG_LOCKS_WARN_ON(in_interrupt()); \
+ local_irq_save(flags); \
+ __raw_spin_lock(&(lock)->raw_lock); \
+ DEBUG_LOCKS_WARN_ON(l->magic != l); \
} while (0)
#define spin_unlock_mutex(lock, flags) \
do { \
- spin_unlock(lock); \
- debug_spin_unlock_restore(&debug_mutex_lock, flags); \
+ __raw_spin_unlock(&(lock)->raw_lock); \
+ local_irq_restore(flags); \
+ preempt_check_resched(); \
} while (0)
-
-#define DEBUG_OFF() \
-do { \
- if (debug_mutex_on) { \
- debug_mutex_on = 0; \
- console_verbose(); \
- if (spin_is_locked(&debug_mutex_lock)) \
- spin_unlock(&debug_mutex_lock); \
- } \
-} while (0)
-
-#define DEBUG_BUG() \
-do { \
- if (debug_mutex_on) { \
- DEBUG_OFF(); \
- BUG(); \
- } \
-} while (0)
-
-#define DEBUG_WARN_ON(c) \
-do { \
- if (unlikely(c && debug_mutex_on)) { \
- DEBUG_OFF(); \
- WARN_ON(1); \
- } \
-} while (0)
-
-# define DEBUG_BUG_ON(c) \
-do { \
- if (unlikely(c)) \
- DEBUG_BUG(); \
-} while (0)
-
-#ifdef CONFIG_SMP
-# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c)
-# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c)
-#else
-# define SMP_DEBUG_WARN_ON(c) do { } while (0)
-# define SMP_DEBUG_BUG_ON(c) do { } while (0)
-#endif
-
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 7043db21bbc..8c71cf72a49 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -38,13 +39,14 @@
*
* It is not allowed to initialize an already locked mutex.
*/
-void fastcall __mutex_init(struct mutex *lock, const char *name)
+void
+__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
atomic_set(&lock->count, 1);
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
- debug_mutex_init(lock, name);
+ debug_mutex_init(lock, name, key);
}
EXPORT_SYMBOL(__mutex_init);
@@ -56,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init);
* branch is predicted by the CPU as default-untaken.
*/
static void fastcall noinline __sched
-__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
+__mutex_lock_slowpath(atomic_t *lock_count);
/***
* mutex_lock - acquire the mutex
@@ -79,7 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
*
* This function is similar to (but not equivalent to) down().
*/
-void fastcall __sched mutex_lock(struct mutex *lock)
+void inline fastcall __sched mutex_lock(struct mutex *lock)
{
might_sleep();
/*
@@ -92,7 +94,7 @@ void fastcall __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
static void fastcall noinline __sched
-__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__);
+__mutex_unlock_slowpath(atomic_t *lock_count);
/***
* mutex_unlock - release the mutex
@@ -120,18 +122,18 @@ EXPORT_SYMBOL(mutex_unlock);
* Lock a mutex (possibly interruptible), slowpath:
*/
static inline int __sched
-__mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
+__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned int old_val;
unsigned long flags;
- debug_mutex_init_waiter(&waiter);
-
spin_lock_mutex(&lock->wait_lock, flags);
- debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
+ debug_mutex_lock_common(lock, &waiter);
+ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ debug_mutex_add_waiter(lock, &waiter, task->thread_info);
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -158,6 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
if (unlikely(state == TASK_INTERRUPTIBLE &&
signal_pending(task))) {
mutex_remove_waiter(lock, &waiter, task->thread_info);
+ mutex_release(&lock->dep_map, 1, _RET_IP_);
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
@@ -173,7 +176,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
/* got the lock - rejoice! */
mutex_remove_waiter(lock, &waiter, task->thread_info);
- debug_mutex_set_owner(lock, task->thread_info __IP__);
+ debug_mutex_set_owner(lock, task->thread_info);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
@@ -183,32 +186,40 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
debug_mutex_free_waiter(&waiter);
- DEBUG_WARN_ON(list_empty(&lock->held_list));
- DEBUG_WARN_ON(lock->owner != task->thread_info);
-
return 0;
}
static void fastcall noinline __sched
-__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__)
+__mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void __sched
+mutex_lock_nested(struct mutex *lock, unsigned int subclass)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
}
+EXPORT_SYMBOL_GPL(mutex_lock_nested);
+#endif
+
/*
* Release the lock, slowpath:
*/
-static fastcall noinline void
-__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
+static fastcall inline void
+__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
- DEBUG_WARN_ON(lock->owner != current_thread_info());
-
spin_lock_mutex(&lock->wait_lock, flags);
+ mutex_release(&lock->dep_map, nested, _RET_IP_);
+ debug_mutex_unlock(lock);
/*
* some architectures leave the lock unlocked in the fastpath failure
@@ -218,8 +229,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
- debug_mutex_unlock(lock);
-
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
struct mutex_waiter *waiter =
@@ -237,11 +246,20 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
}
/*
+ * Release the lock, slowpath:
+ */
+static fastcall noinline void
+__mutex_unlock_slowpath(atomic_t *lock_count)
+{
+ __mutex_unlock_common_slowpath(lock_count, 1);
+}
+
+/*
* Here come the less common (and hence less performance-critical) APIs:
* mutex_lock_interruptible() and mutex_trylock().
*/
static int fastcall noinline __sched
-__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__);
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
/***
* mutex_lock_interruptible - acquire the mutex, interruptable
@@ -264,11 +282,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock_interruptible);
static int fastcall noinline __sched
-__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
+__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
}
/*
@@ -284,8 +302,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
spin_lock_mutex(&lock->wait_lock, flags);
prev = atomic_xchg(&lock->count, -1);
- if (likely(prev == 1))
- debug_mutex_set_owner(lock, current_thread_info() __RET_IP__);
+ if (likely(prev == 1)) {
+ debug_mutex_set_owner(lock, current_thread_info());
+ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ }
/* Set it back to 0 if there are no waiters: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
@@ -309,7 +329,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
* This function must not be used in interrupt context. The
* mutex must be released by the same task that acquired it.
*/
-int fastcall mutex_trylock(struct mutex *lock)
+int fastcall __sched mutex_trylock(struct mutex *lock)
{
return __mutex_fastpath_trylock(&lock->count,
__mutex_trylock_slowpath);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 06918994725..a075dafbb29 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -16,22 +16,15 @@
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
-#define DEBUG_WARN_ON(c) do { } while (0)
#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
#define debug_mutex_clear_owner(lock) do { } while (0)
-#define debug_mutex_init_waiter(waiter) do { } while (0)
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
-#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0)
+#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
#define debug_mutex_unlock(lock) do { } while (0)
-#define debug_mutex_init(lock, name) do { } while (0)
-
-/*
- * Return-address parameters/declarations. They are very useful for
- * debugging, but add overhead in the !DEBUG case - so we go the
- * trouble of using this not too elegant but zero-cost solution:
- */
-#define __IP_DECL__
-#define __IP__
-#define __RET_IP__
+#define debug_mutex_init(lock, name, key) do { } while (0)
+static inline void
+debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
+{
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index ab13f0f668b..8010b9b17ac 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
#include <linux/interrupt.h>
#include <linux/nmi.h>
#include <linux/kexec.h>
+#include <linux/debug_locks.h>
int panic_on_oops;
int tainted;
@@ -172,6 +173,7 @@ const char *print_tainted(void)
void add_taint(unsigned flag)
{
+ debug_locks = 0; /* can't trust the integrity of the kernel anymore */
tainted |= flag;
}
EXPORT_SYMBOL(add_taint);
@@ -256,6 +258,7 @@ int oops_may_print(void)
*/
void oops_enter(void)
{
+ debug_locks_off(); /* can't trust the integrity of the kernel anymore */
do_oops_enter_exit();
}
diff --git a/kernel/pid.c b/kernel/pid.c
index eeb836b65ca..93e212f2067 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -218,7 +218,7 @@ struct pid * fastcall find_pid(int nr)
return NULL;
}
-int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
+int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
{
struct pid_link *link;
struct pid *pid;
@@ -233,7 +233,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
return 0;
}
-void fastcall detach_pid(task_t *task, enum pid_type type)
+void fastcall detach_pid(struct task_struct *task, enum pid_type type)
{
struct pid_link *link;
struct pid *pid;
@@ -267,7 +267,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
/*
* Must be called under rcu_read_lock() or with tasklist_lock read-held.
*/
-task_t *find_task_by_pid_type(int type, int nr)
+struct task_struct *find_task_by_pid_type(int type, int nr)
{
return pid_task(find_pid(nr), type);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ae44a70aae8..619ecabf7c5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -56,7 +56,7 @@ config PM_TRACE
config SOFTWARE_SUSPEND
bool "Software Suspend"
- depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
+ depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
---help---
Enable the possibility of suspending the machine.
It doesn't need ACPI or APM.
@@ -78,6 +78,10 @@ config SOFTWARE_SUSPEND
For more information take a look at <file:Documentation/power/swsusp.txt>.
+ (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386.
+ we need identity mapping for resume to work, and that is trivial
+ to get with 4MB pages, but less than trivial on PAE).
+
config PM_STD_PARTITION
string "Default resume partition"
depends on SOFTWARE_SUSPEND
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 84063ac8fcf..c50d15266c1 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
return dev;
}
-static void __pm_unregister(struct pm_dev *dev)
-{
- if (dev) {
- list_del(&dev->entry);
- kfree(dev);
- }
-}
-
-/**
- * pm_unregister_all - unregister all devices with matching callback
- * @callback: callback function pointer
- *
- * Unregister every device that would call the callback passed. This
- * is primarily meant as a helper function for loadable modules. It
- * enables a module to give up all its managed devices without keeping
- * its own private list.
- */
-
-void pm_unregister_all(pm_callback callback)
-{
- struct list_head *entry;
-
- if (!callback)
- return;
-
- mutex_lock(&pm_devs_lock);
- entry = pm_devs.next;
- while (entry != &pm_devs) {
- struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
- entry = entry->next;
- if (dev->callback == callback)
- __pm_unregister(dev);
- }
- mutex_unlock(&pm_devs_lock);
-}
-
/**
* pm_send - send request to a single device
* @dev: device to send to
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data)
}
EXPORT_SYMBOL(pm_register);
-EXPORT_SYMBOL(pm_unregister_all);
EXPORT_SYMBOL(pm_send_all);
EXPORT_SYMBOL(pm_active);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b2a5f671d6c..72e72d2c61e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p)
}
}
+static void cancel_freezing(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (freezing(p)) {
+ pr_debug(" clean up: %s\n", p->comm);
+ do_not_freeze(p);
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ recalc_sigpending_tsk(p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+}
+
/* 0 = success, else # of processes that we failed to stop */
int freeze_processes(void)
{
int todo, nr_user, user_frozen;
unsigned long start_time;
struct task_struct *g, *p;
- unsigned long flags;
printk( "Stopping tasks: " );
start_time = jiffies;
@@ -85,6 +97,10 @@ int freeze_processes(void)
continue;
if (frozen(p))
continue;
+ if (p->state == TASK_TRACED && frozen(p->parent)) {
+ cancel_freezing(p);
+ continue;
+ }
if (p->mm && !(p->flags & PF_BORROWED_MM)) {
/* The task is a user-space one.
* Freeze it unless there's a vfork completion
@@ -126,13 +142,7 @@ int freeze_processes(void)
do_each_thread(g, p) {
if (freezeable(p) && !frozen(p))
printk(KERN_ERR " %s\n", p->comm);
- if (freezing(p)) {
- pr_debug(" clean up: %s\n", p->comm);
- p->flags &= ~PF_FREEZE;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- recalc_sigpending_tsk(p);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
+ cancel_freezing(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
return todo;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 24c96f35423..75d4886e648 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -227,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist)
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
if (saveable(zone, &zone_pfn)) {
struct page *page;
+ long *src, *dst;
+ int n;
+
page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
BUG_ON(!pbe);
pbe->orig_address = (unsigned long)page_address(page);
- /* copy_page is not usable for copying task structs. */
- memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
+ /* copy_page and memcpy are not usable for copying task structs. */
+ dst = (long *)pbe->address;
+ src = (long *)pbe->orig_address;
+ for (n = PAGE_SIZE / sizeof(long); n; n--)
+ *dst++ = *src++;
pbe = pbe->next;
}
}
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 044b8e0c102..f1dd146bd64 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -263,7 +263,6 @@ int swsusp_write(void)
struct swap_map_handle handle;
struct snapshot_handle snapshot;
struct swsusp_info *header;
- unsigned long start;
int error;
if ((error = swsusp_swap_check())) {
@@ -281,16 +280,17 @@ int swsusp_write(void)
}
error = get_swap_writer(&handle);
if (!error) {
- start = handle.cur_swap;
+ unsigned long start = handle.cur_swap;
error = swap_write_page(&handle, header);
- }
- if (!error)
- error = save_image(&handle, &snapshot, header->pages - 1);
- if (!error) {
- flush_swap_writer(&handle);
- printk("S");
- error = mark_swapfiles(swp_entry(root_swap, start));
- printk("|\n");
+ if (!error)
+ error = save_image(&handle, &snapshot,
+ header->pages - 1);
+ if (!error) {
+ flush_swap_writer(&handle);
+ printk("S");
+ error = mark_swapfiles(swp_entry(root_swap, start));
+ printk("|\n");
+ }
}
if (error)
free_all_swap_pages(root_swap, handle.bitmap);
@@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0);
static int end_io(struct bio *bio, unsigned int num, int err)
{
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- panic("I/O error reading memory image");
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ printk(KERN_ERR "I/O error reading swsusp image.\n");
+ return -EIO;
+ }
atomic_set(&io_done, 0);
return 0;
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 39ae24d2a41..1149365e989 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -52,7 +52,7 @@ int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
-EXPORT_SYMBOL(console_printk);
+EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
/*
* Low lever drivers may need that to know if they can schedule in
@@ -518,7 +518,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
zap_locks();
/* This stops the holder of console_sem just where we want him */
- spin_lock_irqsave(&logbuf_lock, flags);
+ local_irq_save(flags);
+ lockdep_off();
+ spin_lock(&logbuf_lock);
printk_cpu = smp_processor_id();
/* Emit the output into the temporary buffer */
@@ -588,7 +590,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
*/
console_locked = 1;
printk_cpu = UINT_MAX;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+ spin_unlock(&logbuf_lock);
/*
* Console drivers may assume that per-cpu resources have
@@ -604,6 +606,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
console_locked = 0;
up(&console_sem);
}
+ lockdep_on();
+ local_irq_restore(flags);
} else {
/*
* Someone else owns the drivers. We drop the spinlock, which
@@ -611,7 +615,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* console drivers with the output which we just produced.
*/
printk_cpu = UINT_MAX;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+ spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
}
preempt_enable();
@@ -767,7 +773,7 @@ int is_console_locked(void)
{
return console_locked;
}
-EXPORT_SYMBOL(is_console_locked);
+EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
/**
* release_console_sem - unlock the console system
@@ -793,6 +799,9 @@ void release_console_sem(void)
up(&secondary_console_sem);
return;
}
+
+ console_may_schedule = 0;
+
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
wake_klogd |= log_start - log_end;
@@ -806,11 +815,17 @@ void release_console_sem(void)
local_irq_restore(flags);
}
console_locked = 0;
- console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
- if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
- wake_up_interruptible(&log_wait);
+ if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
+ /*
+ * If we printk from within the lock dependency code,
+ * from within the scheduler code, then do not lock
+ * up due to self-recursion:
+ */
+ if (!lockdep_internal())
+ wake_up_interruptible(&log_wait);
+ }
}
EXPORT_SYMBOL(release_console_sem);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 335c5b932e1..9a111f70145 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,7 +28,7 @@
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(task_t *child, task_t *new_parent)
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
BUG_ON(!list_empty(&child->ptrace_list));
if (child->parent == new_parent)
@@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent)
* TASK_TRACED, resume it now.
* Requires that irqs be disabled.
*/
-void ptrace_untrace(task_t *child)
+void ptrace_untrace(struct task_struct *child)
{
spin_lock(&child->sighand->siglock);
if (child->state == TASK_TRACED) {
@@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child)
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_unlink(task_t *child)
+void __ptrace_unlink(struct task_struct *child)
{
BUG_ON(!child->ptrace);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f464f5ae3f1..523e46483b9 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -53,13 +53,13 @@
static struct rcu_ctrlblk rcu_ctrlblk = {
.cur = -300,
.completed = -300,
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.cur = -300,
.completed = -300,
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
.cpumask = CPU_MASK_NONE,
};
@@ -241,12 +241,16 @@ static void rcu_do_batch(struct rcu_data *rdp)
next = rdp->donelist = list->next;
list->func(list);
list = next;
- rdp->qlen--;
if (++count >= rdp->blimit)
break;
}
+
+ local_irq_disable();
+ rdp->qlen -= count;
+ local_irq_enable();
if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
rdp->blimit = blimit;
+
if (!rdp->donelist)
rdp->donetail = &rdp->donelist;
else
@@ -548,7 +552,7 @@ static void __devinit rcu_online_cpu(int cpu)
tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
}
-static int __devinit rcu_cpu_notify(struct notifier_block *self,
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -565,7 +569,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __devinitdata rcu_nb = {
+static struct notifier_block __cpuinitdata rcu_nb = {
.notifier_call = rcu_cpu_notify,
};
diff --git a/kernel/resource.c b/kernel/resource.c
index 129cf046e56..46286434af8 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -244,6 +244,7 @@ int find_next_system_ram(struct resource *res)
start = res->start;
end = res->end;
+ BUG_ON(start >= end);
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
@@ -254,15 +255,17 @@ int find_next_system_ram(struct resource *res)
p = NULL;
break;
}
- if (p->start >= start)
+ if ((p->end >= start) && (p->start < end))
break;
}
read_unlock(&resource_lock);
if (!p)
return -1;
/* copy data */
- res->start = p->start;
- res->end = p->end;
+ if (res->start < p->start)
+ res->start = p->start;
+ if (res->end > p->end)
+ res->end = p->end;
return 0;
}
#endif
@@ -404,8 +407,6 @@ int insert_resource(struct resource *parent, struct resource *new)
return result;
}
-EXPORT_SYMBOL(insert_resource);
-
/*
* Given an existing resource, change its start and size to match the
* arguments. Returns -EBUSY if it can't fit. Existing children of
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 4aa8a2c9f45..0c1faa950af 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -26,6 +26,7 @@
#include <linux/interrupt.h>
#include <linux/plist.h>
#include <linux/fs.h>
+#include <linux/debug_locks.h>
#include "rtmutex_common.h"
@@ -45,8 +46,6 @@ do { \
console_verbose(); \
if (spin_is_locked(&current->pi_lock)) \
spin_unlock(&current->pi_lock); \
- if (spin_is_locked(&current->held_list_lock)) \
- spin_unlock(&current->held_list_lock); \
} \
} while (0)
@@ -97,7 +96,7 @@ void deadlock_trace_off(void)
rt_trace_on = 0;
}
-static void printk_task(task_t *p)
+static void printk_task(struct task_struct *p)
{
if (p)
printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
@@ -105,14 +104,6 @@ static void printk_task(task_t *p)
printk("<none>");
}
-static void printk_task_short(task_t *p)
-{
- if (p)
- printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
- else
- printk("<none>");
-}
-
static void printk_lock(struct rt_mutex *lock, int print_owner)
{
if (lock->name)
@@ -128,222 +119,6 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
printk_task(rt_mutex_owner(lock));
printk("\n");
}
- if (rt_mutex_owner(lock)) {
- printk("... acquired at: ");
- print_symbol("%s\n", lock->acquire_ip);
- }
-}
-
-static void printk_waiter(struct rt_mutex_waiter *w)
-{
- printk("-------------------------\n");
- printk("| waiter struct %p:\n", w);
- printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
- w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
- w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
- w->list_entry.prio);
- printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
- w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
- w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
- w->pi_list_entry.prio);
- printk("\n| lock:\n");
- printk_lock(w->lock, 1);
- printk("| w->ti->task:\n");
- printk_task(w->task);
- printk("| blocked at: ");
- print_symbol("%s\n", w->ip);
- printk("-------------------------\n");
-}
-
-static void show_task_locks(task_t *p)
-{
- switch (p->state) {
- case TASK_RUNNING: printk("R"); break;
- case TASK_INTERRUPTIBLE: printk("S"); break;
- case TASK_UNINTERRUPTIBLE: printk("D"); break;
- case TASK_STOPPED: printk("T"); break;
- case EXIT_ZOMBIE: printk("Z"); break;
- case EXIT_DEAD: printk("X"); break;
- default: printk("?"); break;
- }
- printk_task(p);
- if (p->pi_blocked_on) {
- struct rt_mutex *lock = p->pi_blocked_on->lock;
-
- printk(" blocked on:");
- printk_lock(lock, 1);
- } else
- printk(" (not blocked)\n");
-}
-
-void rt_mutex_show_held_locks(task_t *task, int verbose)
-{
- struct list_head *curr, *cursor = NULL;
- struct rt_mutex *lock;
- task_t *t;
- unsigned long flags;
- int count = 0;
-
- if (!rt_trace_on)
- return;
-
- if (verbose) {
- printk("------------------------------\n");
- printk("| showing all locks held by: | (");
- printk_task_short(task);
- printk("):\n");
- printk("------------------------------\n");
- }
-
-next:
- spin_lock_irqsave(&task->held_list_lock, flags);
- list_for_each(curr, &task->held_list_head) {
- if (cursor && curr != cursor)
- continue;
- lock = list_entry(curr, struct rt_mutex, held_list_entry);
- t = rt_mutex_owner(lock);
- WARN_ON(t != task);
- count++;
- cursor = curr->next;
- spin_unlock_irqrestore(&task->held_list_lock, flags);
-
- printk("\n#%03d: ", count);
- printk_lock(lock, 0);
- goto next;
- }
- spin_unlock_irqrestore(&task->held_list_lock, flags);
-
- printk("\n");
-}
-
-void rt_mutex_show_all_locks(void)
-{
- task_t *g, *p;
- int count = 10;
- int unlock = 1;
-
- printk("\n");
- printk("----------------------\n");
- printk("| showing all tasks: |\n");
- printk("----------------------\n");
-
- /*
- * Here we try to get the tasklist_lock as hard as possible,
- * if not successful after 2 seconds we ignore it (but keep
- * trying). This is to enable a debug printout even if a
- * tasklist_lock-holding task deadlocks or crashes.
- */
-retry:
- if (!read_trylock(&tasklist_lock)) {
- if (count == 10)
- printk("hm, tasklist_lock locked, retrying... ");
- if (count) {
- count--;
- printk(" #%d", 10-count);
- mdelay(200);
- goto retry;
- }
- printk(" ignoring it.\n");
- unlock = 0;
- }
- if (count != 10)
- printk(" locked it.\n");
-
- do_each_thread(g, p) {
- show_task_locks(p);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
- } while_each_thread(g, p);
-
- printk("\n");
-
- printk("-----------------------------------------\n");
- printk("| showing all locks held in the system: |\n");
- printk("-----------------------------------------\n");
-
- do_each_thread(g, p) {
- rt_mutex_show_held_locks(p, 0);
- if (!unlock)
- if (read_trylock(&tasklist_lock))
- unlock = 1;
- } while_each_thread(g, p);
-
-
- printk("=============================================\n\n");
-
- if (unlock)
- read_unlock(&tasklist_lock);
-}
-
-void rt_mutex_debug_check_no_locks_held(task_t *task)
-{
- struct rt_mutex_waiter *w;
- struct list_head *curr;
- struct rt_mutex *lock;
-
- if (!rt_trace_on)
- return;
- if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
- printk("BUG: PI priority boost leaked!\n");
- printk_task(task);
- printk("\n");
- }
- if (list_empty(&task->held_list_head))
- return;
-
- spin_lock(&task->pi_lock);
- plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
- TRACE_OFF();
-
- printk("hm, PI interest held at exit time? Task:\n");
- printk_task(task);
- printk_waiter(w);
- return;
- }
- spin_unlock(&task->pi_lock);
-
- list_for_each(curr, &task->held_list_head) {
- lock = list_entry(curr, struct rt_mutex, held_list_entry);
-
- printk("BUG: %s/%d, lock held at task exit time!\n",
- task->comm, task->pid);
- printk_lock(lock, 1);
- if (rt_mutex_owner(lock) != task)
- printk("exiting task is not even the owner??\n");
- }
-}
-
-int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
-{
- const void *to = from + len;
- struct list_head *curr;
- struct rt_mutex *lock;
- unsigned long flags;
- void *lock_addr;
-
- if (!rt_trace_on)
- return 0;
-
- spin_lock_irqsave(&current->held_list_lock, flags);
- list_for_each(curr, &current->held_list_head) {
- lock = list_entry(curr, struct rt_mutex, held_list_entry);
- lock_addr = lock;
- if (lock_addr < from || lock_addr >= to)
- continue;
- TRACE_OFF();
-
- printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
- current->comm, current->pid, lock, from, to);
- dump_stack();
- printk_lock(lock, 1);
- if (rt_mutex_owner(lock) != current)
- printk("freeing task is not even the owner??\n");
- return 1;
- }
- spin_unlock_irqrestore(&current->held_list_lock, flags);
-
- return 0;
}
void rt_mutex_debug_task_free(struct task_struct *task)
@@ -395,85 +170,41 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
current->comm, current->pid);
printk_lock(waiter->lock, 1);
- printk("... trying at: ");
- print_symbol("%s\n", waiter->ip);
-
printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
printk_lock(waiter->deadlock_lock, 1);
- rt_mutex_show_held_locks(current, 1);
- rt_mutex_show_held_locks(task, 1);
+ debug_show_held_locks(current);
+ debug_show_held_locks(task);
printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
show_stack(task, NULL);
printk("\n%s/%d's [current] stackdump:\n\n",
current->comm, current->pid);
dump_stack();
- rt_mutex_show_all_locks();
+ debug_show_all_locks();
+
printk("[ turning off deadlock detection."
"Please report this trace. ]\n\n");
local_irq_disable();
}
-void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
+void debug_rt_mutex_lock(struct rt_mutex *lock)
{
- unsigned long flags;
-
- if (rt_trace_on) {
- TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
-
- spin_lock_irqsave(&current->held_list_lock, flags);
- list_add_tail(&lock->held_list_entry, &current->held_list_head);
- spin_unlock_irqrestore(&current->held_list_lock, flags);
-
- lock->acquire_ip = ip;
- }
}
void debug_rt_mutex_unlock(struct rt_mutex *lock)
{
- unsigned long flags;
-
- if (rt_trace_on) {
- TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
- TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
-
- spin_lock_irqsave(&current->held_list_lock, flags);
- list_del_init(&lock->held_list_entry);
- spin_unlock_irqrestore(&current->held_list_lock, flags);
- }
+ TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
}
-void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
- struct task_struct *powner __IP_DECL__)
+void
+debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
{
- unsigned long flags;
-
- if (rt_trace_on) {
- TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
-
- spin_lock_irqsave(&powner->held_list_lock, flags);
- list_add_tail(&lock->held_list_entry, &powner->held_list_head);
- spin_unlock_irqrestore(&powner->held_list_lock, flags);
-
- lock->acquire_ip = ip;
- }
}
void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
{
- unsigned long flags;
-
- if (rt_trace_on) {
- struct task_struct *owner = rt_mutex_owner(lock);
-
- TRACE_WARN_ON_LOCKED(!owner);
- TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
-
- spin_lock_irqsave(&owner->held_list_lock, flags);
- list_del_init(&lock->held_list_entry);
- spin_unlock_irqrestore(&owner->held_list_lock, flags);
- }
+ TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
}
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -493,17 +224,15 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
{
- void *addr = lock;
-
- if (rt_trace_on) {
- rt_mutex_debug_check_no_locks_freed(addr,
- sizeof(struct rt_mutex));
- INIT_LIST_HEAD(&lock->held_list_entry);
- lock->name = name;
- }
+ /*
+ * Make sure we are not reinitializing a held lock:
+ */
+ debug_check_no_locks_freed((void *)lock, sizeof(*lock));
+ lock->name = name;
}
-void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
+void
+rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
{
}
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
index 7612fbc62d7..14193d596d7 100644
--- a/kernel/rtmutex-debug.h
+++ b/kernel/rtmutex-debug.h
@@ -9,20 +9,16 @@
* This file contains macros used solely by rtmutex.c. Debug version.
*/
-#define __IP_DECL__ , unsigned long ip
-#define __IP__ , ip
-#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
-
extern void
rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
-extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
+extern void debug_rt_mutex_lock(struct rt_mutex *lock);
extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
- struct task_struct *powner __IP_DECL__);
+ struct task_struct *powner);
extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
struct rt_mutex *lock);
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index e82c2f84824..948bd8f643e 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -33,7 +33,7 @@ struct test_thread_data {
};
static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static task_t *threads[MAX_RT_TEST_THREADS];
+static struct task_struct *threads[MAX_RT_TEST_THREADS];
static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
enum test_opcodes {
@@ -275,6 +275,7 @@ static int test_func(void *data)
/* Wait for the next command to be executed */
schedule();
+ try_to_freeze();
if (signal_pending(current))
flush_signals(current);
@@ -361,8 +362,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
{
struct test_thread_data *td;
+ struct task_struct *tsk;
char *curr = buf;
- task_t *tsk;
int i;
td = container_of(dev, struct test_thread_data, sysdev);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 45d61016da5..3e13a1e5856 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -7,6 +7,8 @@
* Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
+ *
+ * See Documentation/rt-mutex-design.txt for details.
*/
#include <linux/spinlock.h>
#include <linux/module.h>
@@ -157,12 +159,11 @@ int max_lock_depth = 1024;
* Decreases task's usage by one - may thus free the task.
* Returns 0 or -EDEADLK.
*/
-static int rt_mutex_adjust_prio_chain(task_t *task,
+static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int deadlock_detect,
struct rt_mutex *orig_lock,
struct rt_mutex_waiter *orig_waiter,
- struct task_struct *top_task
- __IP_DECL__)
+ struct task_struct *top_task)
{
struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -283,6 +284,7 @@ static int rt_mutex_adjust_prio_chain(task_t *task,
spin_unlock_irqrestore(&task->pi_lock, flags);
out_put_task:
put_task_struct(task);
+
return ret;
}
@@ -357,7 +359,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
*
* Must be called with lock->wait_lock held.
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
+static int try_to_take_rt_mutex(struct rt_mutex *lock)
{
/*
* We have to be careful here if the atomic speedups are
@@ -384,7 +386,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
return 0;
/* We got the lock. */
- debug_rt_mutex_lock(lock __IP__);
+ debug_rt_mutex_lock(lock);
rt_mutex_set_owner(lock, current, 0);
@@ -402,13 +404,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
- int detect_deadlock
- __IP_DECL__)
+ int detect_deadlock)
{
+ struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- task_t *owner = rt_mutex_owner(lock);
- int boost = 0, res;
unsigned long flags;
+ int boost = 0, res;
spin_lock_irqsave(&current->pi_lock, flags);
__rt_mutex_adjust_prio(current);
@@ -454,7 +455,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
spin_unlock(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
- current __IP__);
+ current);
spin_lock(&lock->wait_lock);
@@ -526,12 +527,12 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* Must be called with lock->wait_lock held
*/
static void remove_waiter(struct rt_mutex *lock,
- struct rt_mutex_waiter *waiter __IP_DECL__)
+ struct rt_mutex_waiter *waiter)
{
int first = (waiter == rt_mutex_top_waiter(lock));
- int boost = 0;
- task_t *owner = rt_mutex_owner(lock);
+ struct task_struct *owner = rt_mutex_owner(lock);
unsigned long flags;
+ int boost = 0;
spin_lock_irqsave(&current->pi_lock, flags);
plist_del(&waiter->list_entry, &lock->wait_list);
@@ -568,7 +569,7 @@ static void remove_waiter(struct rt_mutex *lock,
spin_unlock(&lock->wait_lock);
- rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
+ rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
spin_lock(&lock->wait_lock);
}
@@ -595,7 +596,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
get_task_struct(task);
spin_unlock_irqrestore(&task->pi_lock, flags);
- rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
+ rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
}
/*
@@ -604,7 +605,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
static int __sched
rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock __IP_DECL__)
+ int detect_deadlock)
{
struct rt_mutex_waiter waiter;
int ret = 0;
@@ -615,7 +616,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
spin_lock(&lock->wait_lock);
/* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock __IP__)) {
+ if (try_to_take_rt_mutex(lock)) {
spin_unlock(&lock->wait_lock);
return 0;
}
@@ -629,7 +630,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
for (;;) {
/* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock __IP__))
+ if (try_to_take_rt_mutex(lock))
break;
/*
@@ -653,7 +654,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
if (!waiter.task) {
ret = task_blocks_on_rt_mutex(lock, &waiter,
- detect_deadlock __IP__);
+ detect_deadlock);
/*
* If we got woken up by the owner then start loop
* all over without going into schedule to try
@@ -680,7 +681,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(TASK_RUNNING);
if (unlikely(waiter.task))
- remove_waiter(lock, &waiter __IP__);
+ remove_waiter(lock, &waiter);
/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -711,7 +712,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
* Slow path try-lock function:
*/
static inline int
-rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
+rt_mutex_slowtrylock(struct rt_mutex *lock)
{
int ret = 0;
@@ -719,7 +720,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
if (likely(rt_mutex_owner(lock) != current)) {
- ret = try_to_take_rt_mutex(lock __IP__);
+ ret = try_to_take_rt_mutex(lock);
/*
* try_to_take_rt_mutex() sets the lock waiters
* bit unconditionally. Clean this up.
@@ -769,13 +770,13 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
int detect_deadlock,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock __IP_DECL__))
+ int detect_deadlock))
{
if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
+ return slowfn(lock, state, NULL, detect_deadlock);
}
static inline int
@@ -783,24 +784,24 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout, int detect_deadlock,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock __IP_DECL__))
+ int detect_deadlock))
{
if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
+ return slowfn(lock, state, timeout, detect_deadlock);
}
static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
- int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
+ int (*slowfn)(struct rt_mutex *lock))
{
if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 1;
}
- return slowfn(lock __RET_IP__);
+ return slowfn(lock);
}
static inline void
@@ -948,7 +949,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner)
{
__rt_mutex_init(lock, NULL);
- debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner, 0);
rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
index 1e0fca13ff7..a1a1dd06421 100644
--- a/kernel/rtmutex.h
+++ b/kernel/rtmutex.h
@@ -10,9 +10,6 @@
* Non-debug version.
*/
-#define __IP_DECL__
-#define __IP__
-#define __RET_IP__
#define rt_mutex_deadlock_check(l) (0)
#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
new file mode 100644
index 00000000000..291ded556aa
--- /dev/null
+++ b/kernel/rwsem.c
@@ -0,0 +1,147 @@
+/* kernel/rwsem.c: R/W semaphores, public implementation
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from asm-i386/semaphore.h
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rwsem.h>
+
+#include <asm/system.h>
+#include <asm/atomic.h>
+
+/*
+ * lock for reading
+ */
+void down_read(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int down_read_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_read_trylock(sem);
+
+ if (ret == 1)
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+EXPORT_SYMBOL(down_read_trylock);
+
+/*
+ * lock for writing
+ */
+void down_write(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ __down_write(sem);
+}
+
+EXPORT_SYMBOL(down_write);
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int down_write_trylock(struct rw_semaphore *sem)
+{
+ int ret = __down_write_trylock(sem);
+
+ if (ret == 1)
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ return ret;
+}
+
+EXPORT_SYMBOL(down_write_trylock);
+
+/*
+ * release a read lock
+ */
+void up_read(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read);
+
+/*
+ * release a write lock
+ */
+void up_write(struct rw_semaphore *sem)
+{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+ __up_write(sem);
+}
+
+EXPORT_SYMBOL(up_write);
+
+/*
+ * downgrade write lock to read lock
+ */
+void downgrade_write(struct rw_semaphore *sem)
+{
+ /*
+ * lockdep: a downgraded write will live on as a write
+ * dependency.
+ */
+ __downgrade_write(sem);
+}
+
+EXPORT_SYMBOL(downgrade_write);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void down_read_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_nested);
+
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
+void down_write_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ __down_write_nested(sem, subclass);
+}
+
+EXPORT_SYMBOL(down_write_nested);
+
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
+#endif
+
+
diff --git a/kernel/sched.c b/kernel/sched.c
index d5e37072ea5..a234fbee123 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -30,6 +30,7 @@
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
@@ -50,6 +51,7 @@
#include <linux/times.h>
#include <linux/acct.h>
#include <linux/kprobes.h>
+#include <linux/delayacct.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
@@ -178,20 +180,15 @@ static unsigned int static_prio_timeslice(int static_prio)
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
-static inline unsigned int task_timeslice(task_t *p)
+static inline unsigned int task_timeslice(struct task_struct *p)
{
return static_prio_timeslice(p->static_prio);
}
-#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
- < (long long) (sd)->cache_hot_time)
-
/*
* These are the runqueue data structures:
*/
-typedef struct runqueue runqueue_t;
-
struct prio_array {
unsigned int nr_active;
DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
@@ -205,7 +202,7 @@ struct prio_array {
* (such as the load balancing or the thread migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
-struct runqueue {
+struct rq {
spinlock_t lock;
/*
@@ -229,9 +226,9 @@ struct runqueue {
unsigned long expired_timestamp;
unsigned long long timestamp_last_tick;
- task_t *curr, *idle;
+ struct task_struct *curr, *idle;
struct mm_struct *prev_mm;
- prio_array_t *active, *expired, arrays[2];
+ struct prio_array *active, *expired, arrays[2];
int best_expired_prio;
atomic_t nr_iowait;
@@ -242,7 +239,7 @@ struct runqueue {
int active_balance;
int push_cpu;
- task_t *migration_thread;
+ struct task_struct *migration_thread;
struct list_head migration_queue;
#endif
@@ -265,9 +262,10 @@ struct runqueue {
unsigned long ttwu_cnt;
unsigned long ttwu_local;
#endif
+ struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct runqueue, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues);
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -276,8 +274,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
-#define for_each_domain(cpu, domain) \
-for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
@@ -292,26 +290,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
-static inline int task_running(runqueue_t *rq, task_t *p)
+static inline int task_running(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
-static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
-static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
spin_unlock_irq(&rq->lock);
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(runqueue_t *rq, task_t *p)
+static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->oncpu;
@@ -320,7 +325,7 @@ static inline int task_running(runqueue_t *rq, task_t *p)
#endif
}
-static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
/*
@@ -337,7 +342,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
#endif
}
-static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
@@ -358,10 +363,10 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
* __task_rq_lock - lock the runqueue a given task resides on.
* Must be called interrupts disabled.
*/
-static inline runqueue_t *__task_rq_lock(task_t *p)
+static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
- struct runqueue *rq;
+ struct rq *rq;
repeat_lock_task:
rq = task_rq(p);
@@ -378,10 +383,10 @@ repeat_lock_task:
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
*/
-static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
__acquires(rq->lock)
{
- struct runqueue *rq;
+ struct rq *rq;
repeat_lock_task:
local_irq_save(*flags);
@@ -394,13 +399,13 @@ repeat_lock_task:
return rq;
}
-static inline void __task_rq_unlock(runqueue_t *rq)
+static inline void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
spin_unlock(&rq->lock);
}
-static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
+static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
__releases(rq->lock)
{
spin_unlock_irqrestore(&rq->lock, *flags);
@@ -420,7 +425,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
seq_printf(seq, "timestamp %lu\n", jiffies);
for_each_online_cpu(cpu) {
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcnt = 0;
@@ -497,9 +502,36 @@ struct file_operations proc_schedstat_operations = {
.release = single_release,
};
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{
+ if (rq) {
+ rq->rq_sched_info.run_delay += delta_jiffies;
+ rq->rq_sched_info.pcnt++;
+ }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{
+ if (rq)
+ rq->rq_sched_info.cpu_time += delta_jiffies;
+}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
#else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{}
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
#endif
@@ -507,10 +539,10 @@ struct file_operations proc_schedstat_operations = {
/*
* rq_lock - lock a given runqueue and disable interrupts.
*/
-static inline runqueue_t *this_rq_lock(void)
+static inline struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
- runqueue_t *rq;
+ struct rq *rq;
local_irq_disable();
rq = this_rq();
@@ -519,7 +551,7 @@ static inline runqueue_t *this_rq_lock(void)
return rq;
}
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/*
* Called when a process is dequeued from the active array and given
* the cpu. We should note that with the exception of interactive
@@ -535,7 +567,7 @@ static inline runqueue_t *this_rq_lock(void)
* long it was from the *first* time it was queued to the time that it
* finally hit a cpu.
*/
-static inline void sched_info_dequeued(task_t *t)
+static inline void sched_info_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
}
@@ -545,23 +577,18 @@ static inline void sched_info_dequeued(task_t *t)
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
-static void sched_info_arrive(task_t *t)
+static void sched_info_arrive(struct task_struct *t)
{
- unsigned long now = jiffies, diff = 0;
- struct runqueue *rq = task_rq(t);
+ unsigned long now = jiffies, delta_jiffies = 0;
if (t->sched_info.last_queued)
- diff = now - t->sched_info.last_queued;
+ delta_jiffies = now - t->sched_info.last_queued;
sched_info_dequeued(t);
- t->sched_info.run_delay += diff;
+ t->sched_info.run_delay += delta_jiffies;
t->sched_info.last_arrival = now;
t->sched_info.pcnt++;
- if (!rq)
- return;
-
- rq->rq_sched_info.run_delay += diff;
- rq->rq_sched_info.pcnt++;
+ rq_sched_info_arrive(task_rq(t), delta_jiffies);
}
/*
@@ -579,25 +606,23 @@ static void sched_info_arrive(task_t *t)
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
-static inline void sched_info_queued(task_t *t)
+static inline void sched_info_queued(struct task_struct *t)
{
- if (!t->sched_info.last_queued)
- t->sched_info.last_queued = jiffies;
+ if (unlikely(sched_info_on()))
+ if (!t->sched_info.last_queued)
+ t->sched_info.last_queued = jiffies;
}
/*
* Called when a process ceases being the active-running process, either
* voluntarily or involuntarily. Now we can calculate how long we ran.
*/
-static inline void sched_info_depart(task_t *t)
+static inline void sched_info_depart(struct task_struct *t)
{
- struct runqueue *rq = task_rq(t);
- unsigned long diff = jiffies - t->sched_info.last_arrival;
+ unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
- t->sched_info.cpu_time += diff;
-
- if (rq)
- rq->rq_sched_info.cpu_time += diff;
+ t->sched_info.cpu_time += delta_jiffies;
+ rq_sched_info_depart(task_rq(t), delta_jiffies);
}
/*
@@ -605,9 +630,10 @@ static inline void sched_info_depart(task_t *t)
* their time slice. (This may also be called when switching to or from
* the idle task.) We are only called when prev != next.
*/
-static inline void sched_info_switch(task_t *prev, task_t *next)
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
- struct runqueue *rq = task_rq(prev);
+ struct rq *rq = task_rq(prev);
/*
* prev now departs the cpu. It's not interesting to record
@@ -620,15 +646,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next)
if (next != rq->idle)
sched_info_arrive(next);
}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+ if (unlikely(sched_info_on()))
+ __sched_info_switch(prev, next);
+}
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*
* Adding/removing a task to/from a priority array:
*/
-static void dequeue_task(struct task_struct *p, prio_array_t *array)
+static void dequeue_task(struct task_struct *p, struct prio_array *array)
{
array->nr_active--;
list_del(&p->run_list);
@@ -636,7 +668,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array)
__clear_bit(p->prio, array->bitmap);
}
-static void enqueue_task(struct task_struct *p, prio_array_t *array)
+static void enqueue_task(struct task_struct *p, struct prio_array *array)
{
sched_info_queued(p);
list_add_tail(&p->run_list, array->queue + p->prio);
@@ -649,12 +681,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
* Put task to the end of the run list without the overhead of dequeue
* followed by enqueue.
*/
-static void requeue_task(struct task_struct *p, prio_array_t *array)
+static void requeue_task(struct task_struct *p, struct prio_array *array)
{
list_move_tail(&p->run_list, array->queue + p->prio);
}
-static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+static inline void
+enqueue_task_head(struct task_struct *p, struct prio_array *array)
{
list_add(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
@@ -677,7 +710,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
* Both properties are important to certain workloads.
*/
-static inline int __normal_prio(task_t *p)
+static inline int __normal_prio(struct task_struct *p)
{
int bonus, prio;
@@ -713,7 +746,7 @@ static inline int __normal_prio(task_t *p)
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
-static void set_load_weight(task_t *p)
+static void set_load_weight(struct task_struct *p)
{
if (has_rt_policy(p)) {
#ifdef CONFIG_SMP
@@ -731,23 +764,25 @@ static void set_load_weight(task_t *p)
p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
}
-static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
+static inline void
+inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
rq->raw_weighted_load += p->load_weight;
}
-static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
+static inline void
+dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
rq->raw_weighted_load -= p->load_weight;
}
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running++;
inc_raw_weighted_load(rq, p);
}
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running--;
dec_raw_weighted_load(rq, p);
@@ -760,7 +795,7 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
* setprio syscalls, and whenever the interactivity
* estimator recalculates.
*/
-static inline int normal_prio(task_t *p)
+static inline int normal_prio(struct task_struct *p)
{
int prio;
@@ -778,7 +813,7 @@ static inline int normal_prio(task_t *p)
* interactivity modifiers. Will be RT if the task got
* RT-boosted. If not then it returns p->normal_prio.
*/
-static int effective_prio(task_t *p)
+static int effective_prio(struct task_struct *p)
{
p->normal_prio = normal_prio(p);
/*
@@ -794,9 +829,9 @@ static int effective_prio(task_t *p)
/*
* __activate_task - move a task to the runqueue.
*/
-static void __activate_task(task_t *p, runqueue_t *rq)
+static void __activate_task(struct task_struct *p, struct rq *rq)
{
- prio_array_t *target = rq->active;
+ struct prio_array *target = rq->active;
if (batch_task(p))
target = rq->expired;
@@ -807,7 +842,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
/*
* __activate_idle_task - move idle task to the _front_ of runqueue.
*/
-static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
{
enqueue_task_head(p, rq->active);
inc_nr_running(p, rq);
@@ -817,7 +852,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
* Recalculate p->normal_prio and p->prio after having slept,
* updating the sleep-average too:
*/
-static int recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(struct task_struct *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
unsigned long sleep_time = now - p->timestamp;
@@ -889,7 +924,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
* Update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
-static void activate_task(task_t *p, runqueue_t *rq, int local)
+static void activate_task(struct task_struct *p, struct rq *rq, int local)
{
unsigned long long now;
@@ -897,7 +932,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
#ifdef CONFIG_SMP
if (!local) {
/* Compensate for drifting sched_clock */
- runqueue_t *this_rq = this_rq();
+ struct rq *this_rq = this_rq();
now = (now - this_rq->timestamp_last_tick)
+ rq->timestamp_last_tick;
}
@@ -936,7 +971,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
/*
* deactivate_task - remove a task from the runqueue.
*/
-static void deactivate_task(struct task_struct *p, runqueue_t *rq)
+static void deactivate_task(struct task_struct *p, struct rq *rq)
{
dec_nr_running(p, rq);
dequeue_task(p, p->array);
@@ -956,7 +991,7 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
-static void resched_task(task_t *p)
+static void resched_task(struct task_struct *p)
{
int cpu;
@@ -977,7 +1012,7 @@ static void resched_task(task_t *p)
smp_send_reschedule(cpu);
}
#else
-static inline void resched_task(task_t *p)
+static inline void resched_task(struct task_struct *p)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
@@ -988,7 +1023,7 @@ static inline void resched_task(task_t *p)
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
*/
-inline int task_curr(const task_t *p)
+inline int task_curr(const struct task_struct *p)
{
return cpu_curr(task_cpu(p)) == p;
}
@@ -1000,22 +1035,23 @@ unsigned long weighted_cpuload(const int cpu)
}
#ifdef CONFIG_SMP
-typedef struct {
+struct migration_req {
struct list_head list;
- task_t *task;
+ struct task_struct *task;
int dest_cpu;
struct completion done;
-} migration_req_t;
+};
/*
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
+static int
+migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
- runqueue_t *rq = task_rq(p);
+ struct rq *rq = task_rq(p);
/*
* If the task is not on a runqueue (and not running), then
@@ -1030,6 +1066,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
req->task = p;
req->dest_cpu = dest_cpu;
list_add(&req->list, &rq->migration_queue);
+
return 1;
}
@@ -1042,10 +1079,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-void wait_task_inactive(task_t *p)
+void wait_task_inactive(struct task_struct *p)
{
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
int preempted;
repeat:
@@ -1076,7 +1113,7 @@ repeat:
* to another CPU then no harm is done and the purpose has been
* achieved as well.
*/
-void kick_process(task_t *p)
+void kick_process(struct task_struct *p)
{
int cpu;
@@ -1096,7 +1133,7 @@ void kick_process(task_t *p)
*/
static inline unsigned long source_load(int cpu, int type)
{
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
if (type == 0)
return rq->raw_weighted_load;
@@ -1110,7 +1147,7 @@ static inline unsigned long source_load(int cpu, int type)
*/
static inline unsigned long target_load(int cpu, int type)
{
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
if (type == 0)
return rq->raw_weighted_load;
@@ -1123,10 +1160,10 @@ static inline unsigned long target_load(int cpu, int type)
*/
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
unsigned long n = rq->nr_running;
- return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
+ return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
}
/*
@@ -1279,7 +1316,7 @@ nextlevel:
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, task_t *p)
+static int wake_idle(int cpu, struct task_struct *p)
{
cpumask_t tmp;
struct sched_domain *sd;
@@ -1302,7 +1339,7 @@ static int wake_idle(int cpu, task_t *p)
return cpu;
}
#else
-static inline int wake_idle(int cpu, task_t *p)
+static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
}
@@ -1322,15 +1359,15 @@ static inline int wake_idle(int cpu, task_t *p)
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(task_t *p, unsigned int state, int sync)
+static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
- runqueue_t *rq;
+ struct rq *rq;
#ifdef CONFIG_SMP
- unsigned long load, this_load;
struct sched_domain *sd, *this_sd = NULL;
+ unsigned long load, this_load;
int new_cpu;
#endif
@@ -1480,15 +1517,14 @@ out:
return success;
}
-int fastcall wake_up_process(task_t *p)
+int fastcall wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
}
-
EXPORT_SYMBOL(wake_up_process);
-int fastcall wake_up_state(task_t *p, unsigned int state)
+int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
return try_to_wake_up(p, state, 0);
}
@@ -1497,7 +1533,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*/
-void fastcall sched_fork(task_t *p, int clone_flags)
+void fastcall sched_fork(struct task_struct *p, int clone_flags)
{
int cpu = get_cpu();
@@ -1521,8 +1557,9 @@ void fastcall sched_fork(task_t *p, int clone_flags)
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
-#ifdef CONFIG_SCHEDSTATS
- memset(&p->sched_info, 0, sizeof(p->sched_info));
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+ if (unlikely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
p->oncpu = 0;
@@ -1565,11 +1602,11 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
+void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
+ struct rq *rq, *this_rq;
unsigned long flags;
int this_cpu, cpu;
- runqueue_t *rq, *this_rq;
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
@@ -1649,10 +1686,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
* artificially, because any timeslice recovered here
* was given away by the parent in the first place.)
*/
-void fastcall sched_exit(task_t *p)
+void fastcall sched_exit(struct task_struct *p)
{
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
/*
* If the child was a (relative-) CPU hog then decrease
@@ -1683,7 +1720,7 @@ void fastcall sched_exit(task_t *p)
* prepare_task_switch sets up locking and calls architecture specific
* hooks.
*/
-static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
{
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
@@ -1704,7 +1741,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
-static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
+static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
__releases(rq->lock)
{
struct mm_struct *mm = rq->prev_mm;
@@ -1742,10 +1779,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
-asmlinkage void schedule_tail(task_t *prev)
+asmlinkage void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- runqueue_t *rq = this_rq();
+ struct rq *rq = this_rq();
+
finish_task_switch(rq, prev);
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
@@ -1759,8 +1797,9 @@ asmlinkage void schedule_tail(task_t *prev)
* context_switch - switch to the new MM and the new
* thread's register state.
*/
-static inline
-task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
+static inline struct task_struct *
+context_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
@@ -1777,6 +1816,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
WARN_ON(rq->prev_mm);
rq->prev_mm = oldmm;
}
+ /*
+ * Since the runqueue lock will be released by the next
+ * task (which is an invalid locking op but in the case
+ * of the scheduler it's an obvious special-case), so we
+ * do an early lockdep release here:
+ */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -1857,12 +1905,21 @@ unsigned long nr_active(void)
#ifdef CONFIG_SMP
/*
+ * Is this task likely cache-hot:
+ */
+static inline int
+task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
+{
+ return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
+}
+
+/*
* double_rq_lock - safely lock two runqueues
*
* Note this does not disable interrupts like task_rq_lock,
* you need to do so manually before calling.
*/
-static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
__acquires(rq1->lock)
__acquires(rq2->lock)
{
@@ -1886,7 +1943,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
* Note this does not restore interrupts like task_rq_unlock,
* you need to do so manually after calling.
*/
-static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__releases(rq1->lock)
__releases(rq2->lock)
{
@@ -1900,7 +1957,7 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
*/
-static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
+static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
@@ -1921,11 +1978,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
* allow dest_cpu, which will force the cpu onto dest_cpu. Then
* the cpu_allowed mask is restored.
*/
-static void sched_migrate_task(task_t *p, int dest_cpu)
+static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
- migration_req_t req;
- runqueue_t *rq;
+ struct migration_req req;
unsigned long flags;
+ struct rq *rq;
rq = task_rq_lock(p, &flags);
if (!cpu_isset(dest_cpu, p->cpus_allowed)
@@ -1936,11 +1993,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
struct task_struct *mt = rq->migration_thread;
+
get_task_struct(mt);
task_rq_unlock(rq, &flags);
wake_up_process(mt);
put_task_struct(mt);
wait_for_completion(&req.done);
+
return;
}
out:
@@ -1964,9 +2023,9 @@ void sched_exec(void)
* pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked.
*/
-static
-void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
- runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+static void pull_task(struct rq *src_rq, struct prio_array *src_array,
+ struct task_struct *p, struct rq *this_rq,
+ struct prio_array *this_array, int this_cpu)
{
dequeue_task(p, src_array);
dec_nr_running(p, src_rq);
@@ -1987,7 +2046,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
-int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
+int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
struct sched_domain *sd, enum idle_type idle,
int *all_pinned)
{
@@ -2019,6 +2078,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
}
#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
+
/*
* move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
* load from busiest to this_rq, as part of a balancing operation within
@@ -2026,18 +2086,17 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
*
* Called with both runqueues locked.
*/
-static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_nr_move, unsigned long max_load_move,
struct sched_domain *sd, enum idle_type idle,
int *all_pinned)
{
- prio_array_t *array, *dst_array;
+ int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
+ best_prio_seen, skip_for_load;
+ struct prio_array *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
- int busiest_best_prio_seen;
- int skip_for_load; /* skip the task based on weighted load issues */
+ struct task_struct *tmp;
long rem_load_move;
- task_t *tmp;
if (max_nr_move == 0 || max_load_move == 0)
goto out;
@@ -2045,15 +2104,15 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
rem_load_move = max_load_move;
pinned = 1;
this_best_prio = rq_best_prio(this_rq);
- busiest_best_prio = rq_best_prio(busiest);
+ best_prio = rq_best_prio(busiest);
/*
* Enable handling of the case where there is more than one task
* with the best priority. If the current running task is one
- * of those with prio==busiest_best_prio we know it won't be moved
+ * of those with prio==best_prio we know it won't be moved
* and therefore it's safe to override the skip (based on load) of
* any task we find with that prio.
*/
- busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
+ best_prio_seen = best_prio == busiest->curr->prio;
/*
* We first consider expired tasks. Those will likely not be
@@ -2089,7 +2148,7 @@ skip_bitmap:
head = array->queue + idx;
curr = head->prev;
skip_queue:
- tmp = list_entry(curr, task_t, run_list);
+ tmp = list_entry(curr, struct task_struct, run_list);
curr = curr->prev;
@@ -2100,10 +2159,11 @@ skip_queue:
*/
skip_for_load = tmp->load_weight > rem_load_move;
if (skip_for_load && idx < this_best_prio)
- skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
+ skip_for_load = !best_prio_seen && idx == best_prio;
if (skip_for_load ||
!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
- busiest_best_prio_seen |= idx == busiest_best_prio;
+
+ best_prio_seen |= idx == best_prio;
if (curr != head)
goto skip_queue;
idx++;
@@ -2146,8 +2206,8 @@ out:
/*
* find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which should be
- * moved to restore balance via the imbalance parameter.
+ * domain. It calculates and returns the amount of weighted load which
+ * should be moved to restore balance via the imbalance parameter.
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
@@ -2188,7 +2248,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
sum_weighted_load = sum_nr_running = avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
- runqueue_t *rq = cpu_rq(i);
+ struct rq *rq = cpu_rq(i);
if (*sd_idle && !idle_cpu(i))
*sd_idle = 0;
@@ -2269,7 +2329,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* capacity but still has some space to pick up some load
* from other group and save more power
*/
- if (sum_nr_running <= group_capacity - 1)
+ if (sum_nr_running <= group_capacity - 1) {
if (sum_nr_running > leader_nr_running ||
(sum_nr_running == leader_nr_running &&
first_cpu(group->cpumask) >
@@ -2277,7 +2337,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
group_leader = group;
leader_nr_running = sum_nr_running;
}
-
+ }
group_next:
#endif
group = group->next;
@@ -2332,8 +2392,7 @@ group_next:
* moved
*/
if (*imbalance < busiest_load_per_task) {
- unsigned long pwr_now, pwr_move;
- unsigned long tmp;
+ unsigned long tmp, pwr_now, pwr_move;
unsigned int imbn;
small_imbalance:
@@ -2405,22 +2464,23 @@ ret:
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
-static runqueue_t *find_busiest_queue(struct sched_group *group,
- enum idle_type idle, unsigned long imbalance)
+static struct rq *
+find_busiest_queue(struct sched_group *group, enum idle_type idle,
+ unsigned long imbalance)
{
+ struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
- runqueue_t *busiest = NULL, *rqi;
int i;
for_each_cpu_mask(i, group->cpumask) {
- rqi = cpu_rq(i);
+ rq = cpu_rq(i);
- if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
+ if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
continue;
- if (rqi->raw_weighted_load > max_load) {
- max_load = rqi->raw_weighted_load;
- busiest = rqi;
+ if (rq->raw_weighted_load > max_load) {
+ max_load = rq->raw_weighted_load;
+ busiest = rq;
}
}
@@ -2433,22 +2493,24 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
*/
#define MAX_PINNED_INTERVAL 512
-#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
+static inline unsigned long minus_1_or_zero(unsigned long n)
+{
+ return n > 0 ? n - 1 : 0;
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*
* Called with this_rq unlocked.
*/
-static int load_balance(int this_cpu, runqueue_t *this_rq,
+static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum idle_type idle)
{
+ int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
- runqueue_t *busiest;
unsigned long imbalance;
- int nr_moved, all_pinned = 0;
- int active_balance = 0;
- int sd_idle = 0;
+ struct rq *busiest;
if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
!sched_smt_power_savings)
@@ -2482,8 +2544,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
*/
double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- minus_1_or_zero(busiest->nr_running),
- imbalance, sd, idle, &all_pinned);
+ minus_1_or_zero(busiest->nr_running),
+ imbalance, sd, idle, &all_pinned);
double_rq_unlock(this_rq, busiest);
/* All tasks on this runqueue were pinned by CPU affinity */
@@ -2556,7 +2618,8 @@ out_one_pinned:
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
return -1;
return 0;
}
@@ -2568,11 +2631,11 @@ out_one_pinned:
* Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
* this_rq is locked.
*/
-static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
- struct sched_domain *sd)
+static int
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
struct sched_group *group;
- runqueue_t *busiest = NULL;
+ struct rq *busiest = NULL;
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
@@ -2618,9 +2681,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !sched_smt_power_savings)
return -1;
sd->nr_balance_failed = 0;
+
return 0;
}
@@ -2628,16 +2693,15 @@ out_balanced:
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static void idle_balance(int this_cpu, runqueue_t *this_rq)
+static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
for_each_domain(this_cpu, sd) {
if (sd->flags & SD_BALANCE_NEWIDLE) {
- if (load_balance_newidle(this_cpu, this_rq, sd)) {
- /* We've pulled tasks over so stop searching */
+ /* If we've pulled tasks over stop searching: */
+ if (load_balance_newidle(this_cpu, this_rq, sd))
break;
- }
}
}
}
@@ -2650,14 +2714,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq)
*
* Called with busiest_rq locked.
*/
-static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
+static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
- struct sched_domain *sd;
- runqueue_t *target_rq;
int target_cpu = busiest_rq->push_cpu;
+ struct sched_domain *sd;
+ struct rq *target_rq;
+ /* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
- /* no task to move */
return;
target_rq = cpu_rq(target_cpu);
@@ -2675,21 +2739,20 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
- cpu_isset(busiest_cpu, sd->span))
+ cpu_isset(busiest_cpu, sd->span))
break;
}
- if (unlikely(sd == NULL))
- goto out;
-
- schedstat_inc(sd, alb_cnt);
+ if (likely(sd)) {
+ schedstat_inc(sd, alb_cnt);
- if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
- RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
- schedstat_inc(sd, alb_pushed);
- else
- schedstat_inc(sd, alb_failed);
-out:
+ if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
+ RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
+ NULL))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
spin_unlock(&target_rq->lock);
}
@@ -2702,23 +2765,27 @@ out:
* Balancing parameters are set up in arch_init_sched_domains.
*/
-/* Don't have all balancing operations going off at once */
-#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
+/* Don't have all balancing operations going off at once: */
+static inline unsigned long cpu_offset(int cpu)
+{
+ return jiffies + cpu * HZ / NR_CPUS;
+}
-static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
- enum idle_type idle)
+static void
+rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
{
- unsigned long old_load, this_load;
- unsigned long j = jiffies + CPU_OFFSET(this_cpu);
+ unsigned long this_load, interval, j = cpu_offset(this_cpu);
struct sched_domain *sd;
- int i;
+ int i, scale;
this_load = this_rq->raw_weighted_load;
- /* Update our load */
- for (i = 0; i < 3; i++) {
- unsigned long new_load = this_load;
- int scale = 1 << i;
+
+ /* Update our load: */
+ for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
+ unsigned long old_load, new_load;
+
old_load = this_rq->cpu_load[i];
+ new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
* prevents us from getting stuck on 9 if the load is 10, for
@@ -2730,8 +2797,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
}
for_each_domain(this_cpu, sd) {
- unsigned long interval;
-
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -2761,17 +2826,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
/*
* on UP we do not need to balance between CPUs:
*/
-static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
+static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
{
}
-static inline void idle_balance(int cpu, runqueue_t *rq)
+static inline void idle_balance(int cpu, struct rq *rq)
{
}
#endif
-static inline int wake_priority_sleeper(runqueue_t *rq)
+static inline int wake_priority_sleeper(struct rq *rq)
{
int ret = 0;
+
#ifdef CONFIG_SCHED_SMT
spin_lock(&rq->lock);
/*
@@ -2795,25 +2861,26 @@ EXPORT_PER_CPU_SYMBOL(kstat);
* This is called on clock ticks and on context switches.
* Bank in p->sched_time the ns elapsed since the last tick or switch.
*/
-static inline void update_cpu_clock(task_t *p, runqueue_t *rq,
- unsigned long long now)
+static inline void
+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
{
- unsigned long long last = max(p->timestamp, rq->timestamp_last_tick);
- p->sched_time += now - last;
+ p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
}
/*
* Return current->sched_time plus any more ns on the sched_clock
* that have not yet been banked.
*/
-unsigned long long current_sched_time(const task_t *tsk)
+unsigned long long current_sched_time(const struct task_struct *p)
{
unsigned long long ns;
unsigned long flags;
+
local_irq_save(flags);
- ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
- ns = tsk->sched_time + (sched_clock() - ns);
+ ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
+ ns = p->sched_time + sched_clock() - ns;
local_irq_restore(flags);
+
return ns;
}
@@ -2827,11 +2894,16 @@ unsigned long long current_sched_time(const task_t *tsk)
* increasing number of running tasks. We also ignore the interactivity
* if a better static_prio task has expired:
*/
-#define EXPIRED_STARVING(rq) \
- ((STARVATION_LIMIT && ((rq)->expired_timestamp && \
- (jiffies - (rq)->expired_timestamp >= \
- STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \
- ((rq)->curr->static_prio > (rq)->best_expired_prio))
+static inline int expired_starving(struct rq *rq)
+{
+ if (rq->curr->static_prio > rq->best_expired_prio)
+ return 1;
+ if (!STARVATION_LIMIT || !rq->expired_timestamp)
+ return 0;
+ if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
+ return 1;
+ return 0;
+}
/*
* Account user cpu time to a process.
@@ -2864,7 +2936,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- runqueue_t *rq = this_rq();
+ struct rq *rq = this_rq();
cputime64_t tmp;
p->stime = cputime_add(p->stime, cputime);
@@ -2894,7 +2966,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
cputime64_t tmp = cputime_to_cputime64(steal);
- runqueue_t *rq = this_rq();
+ struct rq *rq = this_rq();
if (p == rq->idle) {
p->stime = cputime_add(p->stime, steal);
@@ -2915,10 +2987,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
*/
void scheduler_tick(void)
{
- int cpu = smp_processor_id();
- runqueue_t *rq = this_rq();
- task_t *p = current;
unsigned long long now = sched_clock();
+ struct task_struct *p = current;
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
update_cpu_clock(p, rq, now);
@@ -2968,7 +3040,7 @@ void scheduler_tick(void)
if (!rq->expired_timestamp)
rq->expired_timestamp = jiffies;
- if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) {
+ if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
enqueue_task(p, rq->expired);
if (p->static_prio < rq->best_expired_prio)
rq->best_expired_prio = p->static_prio;
@@ -3007,7 +3079,7 @@ out:
}
#ifdef CONFIG_SCHED_SMT
-static inline void wakeup_busy_runqueue(runqueue_t *rq)
+static inline void wakeup_busy_runqueue(struct rq *rq)
{
/* If an SMT runqueue is sleeping due to priority reasons wake it up */
if (rq->curr == rq->idle && rq->nr_running)
@@ -3033,7 +3105,7 @@ static void wake_sleeping_dependent(int this_cpu)
return;
for_each_cpu_mask(i, sd->span) {
- runqueue_t *smt_rq = cpu_rq(i);
+ struct rq *smt_rq = cpu_rq(i);
if (i == this_cpu)
continue;
@@ -3050,7 +3122,8 @@ static void wake_sleeping_dependent(int this_cpu)
* utilize, if another task runs on a sibling. This models the
* slowdown effect of other tasks running on siblings:
*/
-static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+static inline unsigned long
+smt_slice(struct task_struct *p, struct sched_domain *sd)
{
return p->time_slice * (100 - sd->per_cpu_gain) / 100;
}
@@ -3061,7 +3134,8 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
* acquire their lock. As we only trylock the normal locking order does not
* need to be obeyed.
*/
-static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
+static int
+dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
{
struct sched_domain *tmp, *sd = NULL;
int ret = 0, i;
@@ -3081,8 +3155,8 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
return 0;
for_each_cpu_mask(i, sd->span) {
- runqueue_t *smt_rq;
- task_t *smt_curr;
+ struct task_struct *smt_curr;
+ struct rq *smt_rq;
if (i == this_cpu)
continue;
@@ -3127,9 +3201,8 @@ unlock:
static inline void wake_sleeping_dependent(int this_cpu)
{
}
-
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
- task_t *p)
+static inline int
+dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
{
return 0;
}
@@ -3142,12 +3215,13 @@ void fastcall add_preempt_count(int val)
/*
* Underflow?
*/
- BUG_ON((preempt_count() < 0));
+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+ return;
preempt_count() += val;
/*
* Spinlock count overflowing soon?
*/
- BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
}
EXPORT_SYMBOL(add_preempt_count);
@@ -3156,11 +3230,15 @@ void fastcall sub_preempt_count(int val)
/*
* Underflow?
*/
- BUG_ON(val > preempt_count());
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ return;
/*
* Is the spinlock portion underflowing?
*/
- BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+ !(preempt_count() & PREEMPT_MASK)))
+ return;
+
preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
@@ -3178,14 +3256,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
*/
asmlinkage void __sched schedule(void)
{
- long *switch_count;
- task_t *prev, *next;
- runqueue_t *rq;
- prio_array_t *array;
+ struct task_struct *prev, *next;
+ struct prio_array *array;
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
int cpu, idx, new_prio;
+ long *switch_count;
+ struct rq *rq;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -3275,7 +3353,7 @@ need_resched_nonpreemptible:
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
- next = list_entry(queue->next, task_t, run_list);
+ next = list_entry(queue->next, struct task_struct, run_list);
if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
unsigned long long delta = now - next->timestamp;
@@ -3338,12 +3416,11 @@ switch_tasks:
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
-
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_PREEMPT
/*
- * this is is the entry point to schedule() from in-kernel preemption
+ * this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
@@ -3383,11 +3460,10 @@ need_resched:
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
-
EXPORT_SYMBOL(preempt_schedule);
/*
- * this is is the entry point to schedule() from kernel preemption
+ * this is the entry point to schedule() from kernel preemption
* off of irq context.
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
@@ -3399,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
struct task_struct *task = current;
int saved_lock_depth;
#endif
- /* Catch callers which need to be fixed*/
+ /* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
need_resched:
@@ -3432,10 +3508,8 @@ need_resched:
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
void *key)
{
- task_t *p = curr->private;
- return try_to_wake_up(p, mode, sync);
+ return try_to_wake_up(curr->private, mode, sync);
}
-
EXPORT_SYMBOL(default_wake_function);
/*
@@ -3453,13 +3527,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
struct list_head *tmp, *next;
list_for_each_safe(tmp, next, &q->task_list) {
- wait_queue_t *curr;
- unsigned flags;
- curr = list_entry(tmp, wait_queue_t, task_list);
- flags = curr->flags;
+ wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+ unsigned flags = curr->flags;
+
if (curr->func(curr, mode, sync, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) &&
- !--nr_exclusive)
+ (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
@@ -3480,7 +3552,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
__wake_up_common(q, mode, nr_exclusive, 0, key);
spin_unlock_irqrestore(&q->lock, flags);
}
-
EXPORT_SYMBOL(__wake_up);
/*
@@ -3549,6 +3620,7 @@ EXPORT_SYMBOL(complete_all);
void fastcall __sched wait_for_completion(struct completion *x)
{
might_sleep();
+
spin_lock_irq(&x->wait.lock);
if (!x->done) {
DECLARE_WAITQUEUE(wait, current);
@@ -3693,7 +3765,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
schedule();
SLEEP_ON_TAIL
}
-
EXPORT_SYMBOL(interruptible_sleep_on);
long fastcall __sched
@@ -3709,7 +3780,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
return timeout;
}
-
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
void fastcall __sched sleep_on(wait_queue_head_t *q)
@@ -3722,7 +3792,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q)
schedule();
SLEEP_ON_TAIL
}
-
EXPORT_SYMBOL(sleep_on);
long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
@@ -3752,11 +3821,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
*
* Used by the rt_mutex code to implement priority inheritance logic.
*/
-void rt_mutex_setprio(task_t *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, int prio)
{
+ struct prio_array *array;
unsigned long flags;
- prio_array_t *array;
- runqueue_t *rq;
+ struct rq *rq;
int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3793,12 +3862,12 @@ void rt_mutex_setprio(task_t *p, int prio)
#endif
-void set_user_nice(task_t *p, long nice)
+void set_user_nice(struct task_struct *p, long nice)
{
- unsigned long flags;
- prio_array_t *array;
- runqueue_t *rq;
+ struct prio_array *array;
int old_prio, delta;
+ unsigned long flags;
+ struct rq *rq;
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
return;
@@ -3849,10 +3918,11 @@ EXPORT_SYMBOL(set_user_nice);
* @p: task
* @nice: nice value
*/
-int can_nice(const task_t *p, const int nice)
+int can_nice(const struct task_struct *p, const int nice)
{
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
+
return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
capable(CAP_SYS_NICE));
}
@@ -3868,8 +3938,7 @@ int can_nice(const task_t *p, const int nice)
*/
asmlinkage long sys_nice(int increment)
{
- int retval;
- long nice;
+ long nice, retval;
/*
* Setpriority might change our priority at the same moment.
@@ -3908,7 +3977,7 @@ asmlinkage long sys_nice(int increment)
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
-int task_prio(const task_t *p)
+int task_prio(const struct task_struct *p)
{
return p->prio - MAX_RT_PRIO;
}
@@ -3917,7 +3986,7 @@ int task_prio(const task_t *p)
* task_nice - return the nice value of a given task.
* @p: the task in question.
*/
-int task_nice(const task_t *p)
+int task_nice(const struct task_struct *p)
{
return TASK_NICE(p);
}
@@ -3936,7 +4005,7 @@ int idle_cpu(int cpu)
* idle_task - return the idle task for a given cpu.
* @cpu: the processor in question.
*/
-task_t *idle_task(int cpu)
+struct task_struct *idle_task(int cpu)
{
return cpu_rq(cpu)->idle;
}
@@ -3945,7 +4014,7 @@ task_t *idle_task(int cpu)
* find_process_by_pid - find a process with a matching PID value.
* @pid: the pid in question.
*/
-static inline task_t *find_process_by_pid(pid_t pid)
+static inline struct task_struct *find_process_by_pid(pid_t pid)
{
return pid ? find_task_by_pid(pid) : current;
}
@@ -3954,6 +4023,7 @@ static inline task_t *find_process_by_pid(pid_t pid)
static void __setscheduler(struct task_struct *p, int policy, int prio)
{
BUG_ON(p->array);
+
p->policy = policy;
p->rt_priority = prio;
p->normal_prio = normal_prio(p);
@@ -3977,11 +4047,10 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
{
- int retval;
- int oldprio, oldpolicy = -1;
- prio_array_t *array;
+ int retval, oldprio, oldpolicy = -1;
+ struct prio_array *array;
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
@@ -4079,9 +4148,9 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
- int retval;
struct sched_param lparam;
struct task_struct *p;
+ int retval;
if (!param || pid < 0)
return -EINVAL;
@@ -4093,10 +4162,9 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
read_unlock_irq(&tasklist_lock);
return -ESRCH;
}
- get_task_struct(p);
- read_unlock_irq(&tasklist_lock);
retval = sched_setscheduler(p, policy, &lparam);
- put_task_struct(p);
+ read_unlock_irq(&tasklist_lock);
+
return retval;
}
@@ -4132,8 +4200,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
*/
asmlinkage long sys_sched_getscheduler(pid_t pid)
{
+ struct task_struct *p;
int retval = -EINVAL;
- task_t *p;
if (pid < 0)
goto out_nounlock;
@@ -4160,8 +4228,8 @@ out_nounlock:
asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
{
struct sched_param lp;
+ struct task_struct *p;
int retval = -EINVAL;
- task_t *p;
if (!param || pid < 0)
goto out_nounlock;
@@ -4194,9 +4262,9 @@ out_unlock:
long sched_setaffinity(pid_t pid, cpumask_t new_mask)
{
- task_t *p;
- int retval;
cpumask_t cpus_allowed;
+ struct task_struct *p;
+ int retval;
lock_cpu_hotplug();
read_lock(&tasklist_lock);
@@ -4282,8 +4350,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
long sched_getaffinity(pid_t pid, cpumask_t *mask)
{
+ struct task_struct *p;
int retval;
- task_t *p;
lock_cpu_hotplug();
read_lock(&tasklist_lock);
@@ -4342,9 +4410,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
*/
asmlinkage long sys_sched_yield(void)
{
- runqueue_t *rq = this_rq_lock();
- prio_array_t *array = current->array;
- prio_array_t *target = rq->expired;
+ struct rq *rq = this_rq_lock();
+ struct prio_array *array = current->array, *target = rq->expired;
schedstat_inc(rq, yld_cnt);
/*
@@ -4378,6 +4445,7 @@ asmlinkage long sys_sched_yield(void)
* no need to preempt or enable interrupts:
*/
__release(rq->lock);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
_raw_spin_unlock(&rq->lock);
preempt_enable_no_resched();
@@ -4386,9 +4454,9 @@ asmlinkage long sys_sched_yield(void)
return 0;
}
-static inline int __resched_legal(void)
+static inline int __resched_legal(int expected_preempt_count)
{
- if (unlikely(preempt_count()))
+ if (unlikely(preempt_count() != expected_preempt_count))
return 0;
if (unlikely(system_state != SYSTEM_RUNNING))
return 0;
@@ -4414,7 +4482,7 @@ static void __cond_resched(void)
int __sched cond_resched(void)
{
- if (need_resched() && __resched_legal()) {
+ if (need_resched() && __resched_legal(0)) {
__cond_resched();
return 1;
}
@@ -4440,7 +4508,8 @@ int cond_resched_lock(spinlock_t *lock)
ret = 1;
spin_lock(lock);
}
- if (need_resched() && __resched_legal()) {
+ if (need_resched() && __resched_legal(1)) {
+ spin_release(&lock->dep_map, 1, _THIS_IP_);
_raw_spin_unlock(lock);
preempt_enable_no_resched();
__cond_resched();
@@ -4455,8 +4524,10 @@ int __sched cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (need_resched() && __resched_legal()) {
- __local_bh_enable();
+ if (need_resched() && __resched_legal(0)) {
+ raw_local_irq_disable();
+ _local_bh_enable();
+ raw_local_irq_enable();
__cond_resched();
local_bh_disable();
return 1;
@@ -4476,7 +4547,6 @@ void __sched yield(void)
set_current_state(TASK_RUNNING);
sys_sched_yield();
}
-
EXPORT_SYMBOL(yield);
/*
@@ -4488,23 +4558,26 @@ EXPORT_SYMBOL(yield);
*/
void __sched io_schedule(void)
{
- struct runqueue *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
+ delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
schedule();
atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
}
-
EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
- struct runqueue *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = &__raw_get_cpu_var(runqueues);
long ret;
+ delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
ret = schedule_timeout(timeout);
atomic_dec(&rq->nr_iowait);
+ delayacct_blkio_end();
return ret;
}
@@ -4566,9 +4639,9 @@ asmlinkage long sys_sched_get_priority_min(int policy)
asmlinkage
long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
{
+ struct task_struct *p;
int retval = -EINVAL;
struct timespec t;
- task_t *p;
if (pid < 0)
goto out_nounlock;
@@ -4596,35 +4669,36 @@ out_unlock:
static inline struct task_struct *eldest_child(struct task_struct *p)
{
- if (list_empty(&p->children)) return NULL;
+ if (list_empty(&p->children))
+ return NULL;
return list_entry(p->children.next,struct task_struct,sibling);
}
static inline struct task_struct *older_sibling(struct task_struct *p)
{
- if (p->sibling.prev==&p->parent->children) return NULL;
+ if (p->sibling.prev==&p->parent->children)
+ return NULL;
return list_entry(p->sibling.prev,struct task_struct,sibling);
}
static inline struct task_struct *younger_sibling(struct task_struct *p)
{
- if (p->sibling.next==&p->parent->children) return NULL;
+ if (p->sibling.next==&p->parent->children)
+ return NULL;
return list_entry(p->sibling.next,struct task_struct,sibling);
}
-static void show_task(task_t *p)
+static const char stat_nam[] = "RSDTtZX";
+
+static void show_task(struct task_struct *p)
{
- task_t *relative;
- unsigned state;
+ struct task_struct *relative;
unsigned long free = 0;
- static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
+ unsigned state;
- printk("%-13.13s ", p->comm);
state = p->state ? __ffs(p->state) + 1 : 0;
- if (state < ARRAY_SIZE(stat_nam))
- printk(stat_nam[state]);
- else
- printk("?");
+ printk("%-13.13s %c", p->comm,
+ state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if (BITS_PER_LONG == 32)
if (state == TASK_RUNNING)
printk(" running ");
@@ -4668,7 +4742,7 @@ static void show_task(task_t *p)
void show_state(void)
{
- task_t *g, *p;
+ struct task_struct *g, *p;
#if (BITS_PER_LONG == 32)
printk("\n"
@@ -4690,7 +4764,7 @@ void show_state(void)
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
- mutex_debug_show_all_locks();
+ debug_show_all_locks();
}
/**
@@ -4701,9 +4775,9 @@ void show_state(void)
* NOTE: this function does not set the idle thread's NEED_RESCHED
* flag, to make booting more robust.
*/
-void __devinit init_idle(task_t *idle, int cpu)
+void __devinit init_idle(struct task_struct *idle, int cpu)
{
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
unsigned long flags;
idle->timestamp = sched_clock();
@@ -4742,7 +4816,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
/*
* This is how migration works:
*
- * 1) we queue a migration_req_t structure in the source CPU's
+ * 1) we queue a struct migration_req structure in the source CPU's
* runqueue and wake up that CPU's migration thread.
* 2) we down() the locked semaphore => thread blocks.
* 3) migration thread wakes up (implicitly it forces the migrated
@@ -4764,12 +4838,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
+ struct migration_req req;
unsigned long flags;
+ struct rq *rq;
int ret = 0;
- migration_req_t req;
- runqueue_t *rq;
rq = task_rq_lock(p, &flags);
if (!cpus_intersects(new_mask, cpu_online_map)) {
@@ -4792,9 +4866,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
}
out:
task_rq_unlock(rq, &flags);
+
return ret;
}
-
EXPORT_SYMBOL_GPL(set_cpus_allowed);
/*
@@ -4810,7 +4884,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
*/
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
- runqueue_t *rq_dest, *rq_src;
+ struct rq *rq_dest, *rq_src;
int ret = 0;
if (unlikely(cpu_is_offline(dest_cpu)))
@@ -4838,7 +4912,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+ rq_dest->timestamp_last_tick;
deactivate_task(p, rq_src);
- activate_task(p, rq_dest, 0);
+ __activate_task(p, rq_dest);
if (TASK_PREEMPTS_CURR(p, rq_dest))
resched_task(rq_dest->curr);
}
@@ -4855,16 +4929,16 @@ out:
*/
static int migration_thread(void *data)
{
- runqueue_t *rq;
int cpu = (long)data;
+ struct rq *rq;
rq = cpu_rq(cpu);
BUG_ON(rq->migration_thread != current);
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
+ struct migration_req *req;
struct list_head *head;
- migration_req_t *req;
try_to_freeze();
@@ -4888,7 +4962,7 @@ static int migration_thread(void *data)
set_current_state(TASK_INTERRUPTIBLE);
continue;
}
- req = list_entry(head->next, migration_req_t, list);
+ req = list_entry(head->next, struct migration_req, list);
list_del_init(head->next);
spin_unlock(&rq->lock);
@@ -4913,28 +4987,28 @@ wait_to_die:
#ifdef CONFIG_HOTPLUG_CPU
/* Figure out where task on dead CPU should go, use force if neccessary. */
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
+static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
- runqueue_t *rq;
unsigned long flags;
- int dest_cpu;
cpumask_t mask;
+ struct rq *rq;
+ int dest_cpu;
restart:
/* On same node? */
mask = node_to_cpumask(cpu_to_node(dead_cpu));
- cpus_and(mask, mask, tsk->cpus_allowed);
+ cpus_and(mask, mask, p->cpus_allowed);
dest_cpu = any_online_cpu(mask);
/* On any allowed CPU? */
if (dest_cpu == NR_CPUS)
- dest_cpu = any_online_cpu(tsk->cpus_allowed);
+ dest_cpu = any_online_cpu(p->cpus_allowed);
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
- rq = task_rq_lock(tsk, &flags);
- cpus_setall(tsk->cpus_allowed);
- dest_cpu = any_online_cpu(tsk->cpus_allowed);
+ rq = task_rq_lock(p, &flags);
+ cpus_setall(p->cpus_allowed);
+ dest_cpu = any_online_cpu(p->cpus_allowed);
task_rq_unlock(rq, &flags);
/*
@@ -4942,12 +5016,12 @@ restart:
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
- if (tsk->mm && printk_ratelimit())
+ if (p->mm && printk_ratelimit())
printk(KERN_INFO "process %d (%s) no "
"longer affine to cpu%d\n",
- tsk->pid, tsk->comm, dead_cpu);
+ p->pid, p->comm, dead_cpu);
}
- if (!__migrate_task(tsk, dead_cpu, dest_cpu))
+ if (!__migrate_task(p, dead_cpu, dest_cpu))
goto restart;
}
@@ -4958,9 +5032,9 @@ restart:
* their home CPUs. So we just add the counter to another CPU's counter,
* to keep the global sum constant after CPU-down:
*/
-static void migrate_nr_uninterruptible(runqueue_t *rq_src)
+static void migrate_nr_uninterruptible(struct rq *rq_src)
{
- runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
+ struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
unsigned long flags;
local_irq_save(flags);
@@ -4974,48 +5048,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src)
/* Run through task list and migrate tasks from the dead cpu. */
static void migrate_live_tasks(int src_cpu)
{
- struct task_struct *tsk, *t;
+ struct task_struct *p, *t;
write_lock_irq(&tasklist_lock);
- do_each_thread(t, tsk) {
- if (tsk == current)
+ do_each_thread(t, p) {
+ if (p == current)
continue;
- if (task_cpu(tsk) == src_cpu)
- move_task_off_dead_cpu(src_cpu, tsk);
- } while_each_thread(t, tsk);
+ if (task_cpu(p) == src_cpu)
+ move_task_off_dead_cpu(src_cpu, p);
+ } while_each_thread(t, p);
write_unlock_irq(&tasklist_lock);
}
/* Schedules idle task to be the next runnable task on current CPU.
* It does so by boosting its priority to highest possible and adding it to
- * the _front_ of runqueue. Used by CPU offline code.
+ * the _front_ of the runqueue. Used by CPU offline code.
*/
void sched_idle_next(void)
{
- int cpu = smp_processor_id();
- runqueue_t *rq = this_rq();
+ int this_cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(this_cpu);
struct task_struct *p = rq->idle;
unsigned long flags;
/* cpu has to be offline */
- BUG_ON(cpu_online(cpu));
+ BUG_ON(cpu_online(this_cpu));
- /* Strictly not necessary since rest of the CPUs are stopped by now
- * and interrupts disabled on current cpu.
+ /*
+ * Strictly not necessary since rest of the CPUs are stopped by now
+ * and interrupts disabled on the current cpu.
*/
spin_lock_irqsave(&rq->lock, flags);
__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
- /* Add idle task to _front_ of it's priority queue */
+
+ /* Add idle task to the _front_ of its priority queue: */
__activate_idle_task(p, rq);
spin_unlock_irqrestore(&rq->lock, flags);
}
-/* Ensures that the idle task is using init_mm right before its cpu goes
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
* offline.
*/
void idle_task_exit(void)
@@ -5029,17 +5106,17 @@ void idle_task_exit(void)
mmdrop(mm);
}
-static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
+static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
{
- struct runqueue *rq = cpu_rq(dead_cpu);
+ struct rq *rq = cpu_rq(dead_cpu);
/* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD);
+ BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
/* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(tsk->flags & PF_DEAD);
+ BUG_ON(p->flags & PF_DEAD);
- get_task_struct(tsk);
+ get_task_struct(p);
/*
* Drop lock around migration; if someone else moves it,
@@ -5047,25 +5124,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
* fine.
*/
spin_unlock_irq(&rq->lock);
- move_task_off_dead_cpu(dead_cpu, tsk);
+ move_task_off_dead_cpu(dead_cpu, p);
spin_lock_irq(&rq->lock);
- put_task_struct(tsk);
+ put_task_struct(p);
}
/* release_task() removes task from tasklist, so we won't find dead tasks. */
static void migrate_dead_tasks(unsigned int dead_cpu)
{
- unsigned arr, i;
- struct runqueue *rq = cpu_rq(dead_cpu);
+ struct rq *rq = cpu_rq(dead_cpu);
+ unsigned int arr, i;
for (arr = 0; arr < 2; arr++) {
for (i = 0; i < MAX_PRIO; i++) {
struct list_head *list = &rq->arrays[arr].queue[i];
+
while (!list_empty(list))
- migrate_dead(dead_cpu,
- list_entry(list->next, task_t,
- run_list));
+ migrate_dead(dead_cpu, list_entry(list->next,
+ struct task_struct, run_list));
}
}
}
@@ -5075,14 +5152,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
*/
-static int __cpuinit migration_call(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int __cpuinit
+migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
- int cpu = (long)hcpu;
struct task_struct *p;
- struct runqueue *rq;
+ int cpu = (long)hcpu;
unsigned long flags;
+ struct rq *rq;
switch (action) {
case CPU_UP_PREPARE:
@@ -5097,10 +5173,12 @@ static int __cpuinit migration_call(struct notifier_block *nfb,
task_rq_unlock(rq, &flags);
cpu_rq(cpu)->migration_thread = p;
break;
+
case CPU_ONLINE:
/* Strictly unneccessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
break;
+
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
if (!cpu_rq(cpu)->migration_thread)
@@ -5111,6 +5189,7 @@ static int __cpuinit migration_call(struct notifier_block *nfb,
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
break;
+
case CPU_DEAD:
migrate_live_tasks(cpu);
rq = cpu_rq(cpu);
@@ -5131,9 +5210,10 @@ static int __cpuinit migration_call(struct notifier_block *nfb,
* the requestors. */
spin_lock_irq(&rq->lock);
while (!list_empty(&rq->migration_queue)) {
- migration_req_t *req;
+ struct migration_req *req;
+
req = list_entry(rq->migration_queue.next,
- migration_req_t, list);
+ struct migration_req, list);
list_del_init(&req->list);
complete(&req->done);
}
@@ -5155,10 +5235,12 @@ static struct notifier_block __cpuinitdata migration_notifier = {
int __init migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
- /* Start one for boot CPU. */
+
+ /* Start one for the boot CPU: */
migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
+
return 0;
}
#endif
@@ -5254,7 +5336,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
} while (sd);
}
#else
-#define sched_domain_debug(sd, cpu) {}
+# define sched_domain_debug(sd, cpu) do { } while (0)
#endif
static int sd_degenerate(struct sched_domain *sd)
@@ -5280,8 +5362,8 @@ static int sd_degenerate(struct sched_domain *sd)
return 1;
}
-static int sd_parent_degenerate(struct sched_domain *sd,
- struct sched_domain *parent)
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
{
unsigned long cflags = sd->flags, pflags = parent->flags;
@@ -5314,7 +5396,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
*/
static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
- runqueue_t *rq = cpu_rq(cpu);
+ struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
/* Remove the sched domains which do not contribute to scheduling. */
@@ -5576,8 +5658,8 @@ static void touch_cache(void *__cache, unsigned long __size)
/*
* Measure the cache-cost of one task migration. Returns in units of nsec.
*/
-static unsigned long long measure_one(void *cache, unsigned long size,
- int source, int target)
+static unsigned long long
+measure_one(void *cache, unsigned long size, int source, int target)
{
cpumask_t mask, saved_mask;
unsigned long long t0, t1, t2, t3, cost;
@@ -5729,7 +5811,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
cache = vmalloc(max_size);
if (!cache) {
printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
- return 1000000; // return 1 msec on very small boxen
+ return 1000000; /* return 1 msec on very small boxen */
}
while (size <= max_size) {
@@ -5927,9 +6009,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
*/
static cpumask_t sched_domain_node_span(int node)
{
- int i;
- cpumask_t span, nodemask;
DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+ cpumask_t span, nodemask;
+ int i;
cpus_clear(span);
bitmap_zero(used_nodes, MAX_NUMNODES);
@@ -5940,6 +6022,7 @@ static cpumask_t sched_domain_node_span(int node)
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, used_nodes);
+
nodemask = node_to_cpumask(next_node);
cpus_or(span, span, nodemask);
}
@@ -5949,19 +6032,23 @@ static cpumask_t sched_domain_node_span(int node)
#endif
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+
/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
+ * SMT sched-domains:
*/
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
+
static int cpu_to_cpu_group(int cpu)
{
return cpu;
}
#endif
+/*
+ * multi-core sched-domains:
+ */
#ifdef CONFIG_SCHED_MC
static DEFINE_PER_CPU(struct sched_domain, core_domains);
static struct sched_group *sched_group_core_bycpu[NR_CPUS];
@@ -5981,9 +6068,10 @@ static int cpu_to_core_group(int cpu)
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
+
static int cpu_to_phys_group(int cpu)
{
-#if defined(CONFIG_SCHED_MC)
+#ifdef CONFIG_SCHED_MC
cpumask_t mask = cpu_coregroup_map(cpu);
return first_cpu(mask);
#elif defined(CONFIG_SCHED_SMT)
@@ -6404,7 +6492,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
for (i = 0; i < MAX_NUMNODES; i++)
init_numa_sched_groups_power(sched_group_nodes[i]);
- init_numa_sched_groups_power(sched_group_allnodes);
+ if (sched_group_allnodes) {
+ int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+ struct sched_group *sg = &sched_group_allnodes[group];
+
+ init_numa_sched_groups_power(sg);
+ }
#endif
/* Attach the domains */
@@ -6529,6 +6622,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
int err = 0;
+
#ifdef CONFIG_SCHED_SMT
if (smt_capable())
err = sysfs_create_file(&cls->kset.kobj,
@@ -6548,7 +6642,8 @@ static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
{
return sprintf(page, "%u\n", sched_mc_power_savings);
}
-static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
+ const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
}
@@ -6561,7 +6656,8 @@ static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
{
return sprintf(page, "%u\n", sched_smt_power_savings);
}
-static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
+ const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
}
@@ -6623,6 +6719,7 @@ int in_sched_functions(unsigned long addr)
{
/* Linker adds these: start and end of __sched functions */
extern char __sched_text_start[], __sched_text_end[];
+
return in_lock_functions(addr) ||
(addr >= (unsigned long)__sched_text_start
&& addr < (unsigned long)__sched_text_end);
@@ -6630,14 +6727,15 @@ int in_sched_functions(unsigned long addr)
void __init sched_init(void)
{
- runqueue_t *rq;
int i, j, k;
for_each_possible_cpu(i) {
- prio_array_t *array;
+ struct prio_array *array;
+ struct rq *rq;
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+ lockdep_set_class(&rq->lock, &rq->rq_lock_key);
rq->nr_running = 0;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
@@ -6666,6 +6764,11 @@ void __init sched_init(void)
}
set_load_weight(&init_task);
+
+#ifdef CONFIG_RT_MUTEXES
+ plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
+#endif
+
/*
* The boot idle thread does lazy MMU switching as well:
*/
@@ -6684,7 +6787,7 @@ void __init sched_init(void)
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
void __might_sleep(char *file, int line)
{
-#if defined(in_atomic)
+#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
if ((in_atomic() || irqs_disabled()) &&
@@ -6706,10 +6809,10 @@ EXPORT_SYMBOL(__might_sleep);
#ifdef CONFIG_MAGIC_SYSRQ
void normalize_rt_tasks(void)
{
+ struct prio_array *array;
struct task_struct *p;
- prio_array_t *array;
unsigned long flags;
- runqueue_t *rq;
+ struct rq *rq;
read_lock_irq(&tasklist_lock);
for_each_process(p) {
@@ -6753,7 +6856,7 @@ void normalize_rt_tasks(void)
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
*/
-task_t *curr_task(int cpu)
+struct task_struct *curr_task(int cpu)
{
return cpu_curr(cpu);
}
@@ -6773,7 +6876,7 @@ task_t *curr_task(int cpu)
*
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
*/
-void set_curr_task(int cpu, task_t *p)
+void set_curr_task(int cpu, struct task_struct *p)
{
cpu_curr(cpu) = p;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 7fe874d12fa..bfdb5686fa3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -791,22 +791,31 @@ out:
/*
* Force a signal that the process can't ignore: if necessary
* we unblock the signal and change any SIG_IGN to SIG_DFL.
+ *
+ * Note: If we unblock the signal, we always reset it to SIG_DFL,
+ * since we do not want to have a signal handler that was blocked
+ * be invoked when user space had explicitly blocked it.
+ *
+ * We don't want to have recursive SIGSEGV's etc, for example.
*/
-
int
force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
{
unsigned long int flags;
- int ret;
+ int ret, blocked, ignored;
+ struct k_sigaction *action;
spin_lock_irqsave(&t->sighand->siglock, flags);
- if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
- t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
- }
- if (sigismember(&t->blocked, sig)) {
- sigdelset(&t->blocked, sig);
+ action = &t->sighand->action[sig-1];
+ ignored = action->sa.sa_handler == SIG_IGN;
+ blocked = sigismember(&t->blocked, sig);
+ if (blocked || ignored) {
+ action->sa.sa_handler = SIG_DFL;
+ if (blocked) {
+ sigdelset(&t->blocked, sig);
+ recalc_sigpending_tsk(t);
+ }
}
- recalc_sigpending_tsk(t);
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8f03e3b89b5..3789ca98197 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -62,6 +62,137 @@ static inline void wakeup_softirqd(void)
}
/*
+ * This one is for softirq.c-internal use,
+ * where hardirqs are disabled legitimately:
+ */
+#ifdef CONFIG_TRACE_IRQFLAGS
+static void __local_bh_disable(unsigned long ip)
+{
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+
+ raw_local_irq_save(flags);
+ add_preempt_count(SOFTIRQ_OFFSET);
+ /*
+ * Were softirqs turned off above:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_off(ip);
+ raw_local_irq_restore(flags);
+}
+#else /* !CONFIG_TRACE_IRQFLAGS */
+static inline void __local_bh_disable(unsigned long ip)
+{
+ add_preempt_count(SOFTIRQ_OFFSET);
+ barrier();
+}
+#endif /* CONFIG_TRACE_IRQFLAGS */
+
+void local_bh_disable(void)
+{
+ __local_bh_disable((unsigned long)__builtin_return_address(0));
+}
+
+EXPORT_SYMBOL(local_bh_disable);
+
+void __local_bh_enable(void)
+{
+ WARN_ON_ONCE(in_irq());
+
+ /*
+ * softirqs should never be enabled by __local_bh_enable(),
+ * it always nests inside local_bh_enable() sections:
+ */
+ WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
+
+ sub_preempt_count(SOFTIRQ_OFFSET);
+}
+EXPORT_SYMBOL_GPL(__local_bh_enable);
+
+/*
+ * Special-case - softirqs can safely be enabled in
+ * cond_resched_softirq(), or by __do_softirq(),
+ * without processing still-pending softirqs:
+ */
+void _local_bh_enable(void)
+{
+ WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ sub_preempt_count(SOFTIRQ_OFFSET);
+}
+
+EXPORT_SYMBOL(_local_bh_enable);
+
+void local_bh_enable(void)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+#endif
+ WARN_ON_ONCE(irqs_disabled());
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_save(flags);
+#endif
+ /*
+ * Are softirqs going to be turned on now:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
+ do_softirq();
+
+ dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_restore(flags);
+#endif
+ preempt_check_resched();
+}
+EXPORT_SYMBOL(local_bh_enable);
+
+void local_bh_enable_ip(unsigned long ip)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+ unsigned long flags;
+
+ WARN_ON_ONCE(in_irq());
+
+ local_irq_save(flags);
+#endif
+ /*
+ * Are softirqs going to be turned on now:
+ */
+ if (softirq_count() == SOFTIRQ_OFFSET)
+ trace_softirqs_on(ip);
+ /*
+ * Keep preemption disabled until we are done with
+ * softirq processing:
+ */
+ sub_preempt_count(SOFTIRQ_OFFSET - 1);
+
+ if (unlikely(!in_interrupt() && local_softirq_pending()))
+ do_softirq();
+
+ dec_preempt_count();
+#ifdef CONFIG_TRACE_IRQFLAGS
+ local_irq_restore(flags);
+#endif
+ preempt_check_resched();
+}
+EXPORT_SYMBOL(local_bh_enable_ip);
+
+/*
* We restart softirq processing MAX_SOFTIRQ_RESTART times,
* and we fall back to softirqd after that.
*
@@ -80,8 +211,11 @@ asmlinkage void __do_softirq(void)
int cpu;
pending = local_softirq_pending();
+ account_system_vtime(current);
+
+ __local_bh_disable((unsigned long)__builtin_return_address(0));
+ trace_softirq_enter();
- local_bh_disable();
cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
@@ -109,7 +243,10 @@ restart:
if (pending)
wakeup_softirqd();
- __local_bh_enable();
+ trace_softirq_exit();
+
+ account_system_vtime(current);
+ _local_bh_enable();
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -136,23 +273,6 @@ EXPORT_SYMBOL(do_softirq);
#endif
-void local_bh_enable(void)
-{
- WARN_ON(irqs_disabled());
- /*
- * Keep preemption disabled until we are done with
- * softirq processing:
- */
- sub_preempt_count(SOFTIRQ_OFFSET - 1);
-
- if (unlikely(!in_interrupt() && local_softirq_pending()))
- do_softirq();
-
- dec_preempt_count();
- preempt_check_resched();
-}
-EXPORT_SYMBOL(local_bh_enable);
-
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
# define invoke_softirq() __do_softirq()
#else
@@ -165,6 +285,7 @@ EXPORT_SYMBOL(local_bh_enable);
void irq_exit(void)
{
account_system_vtime(current);
+ trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
@@ -208,8 +329,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
softirq_vec[nr].action = action;
}
-EXPORT_SYMBOL(open_softirq);
-
/* Tasklets */
struct tasklet_head
{
@@ -446,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int __devinit cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
@@ -486,7 +605,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block __devinitdata cpu_nfb = {
+static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 6b76caa2298..03e6a2b0b78 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
/*
* Create/destroy watchdog threads as CPUs come and go:
*/
-static int __devinit
+static int __cpuinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-static struct notifier_block __devinitdata cpu_nfb = {
+static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index b31e54eadf5..fb524b009ee 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -13,6 +13,7 @@
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
+#include <linux/debug_locks.h>
#include <linux/module.h>
/*
@@ -29,8 +30,10 @@ EXPORT_SYMBOL(generic__raw_read_trylock);
int __lockfunc _spin_trylock(spinlock_t *lock)
{
preempt_disable();
- if (_raw_spin_trylock(lock))
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable();
return 0;
@@ -40,8 +43,10 @@ EXPORT_SYMBOL(_spin_trylock);
int __lockfunc _read_trylock(rwlock_t *lock)
{
preempt_disable();
- if (_raw_read_trylock(lock))
+ if (_raw_read_trylock(lock)) {
+ rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable();
return 0;
@@ -51,19 +56,28 @@ EXPORT_SYMBOL(_read_trylock);
int __lockfunc _write_trylock(rwlock_t *lock)
{
preempt_disable();
- if (_raw_write_trylock(lock))
+ if (_raw_write_trylock(lock)) {
+ rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable();
return 0;
}
EXPORT_SYMBOL(_write_trylock);
-#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
+/*
+ * If lockdep is enabled then we use the non-preemption spin-ops
+ * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
+ * not re-enabled during lock-acquire (which the preempt-spin-ops do):
+ */
+#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
+ defined(CONFIG_DEBUG_LOCK_ALLOC)
void __lockfunc _read_lock(rwlock_t *lock)
{
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock);
@@ -74,7 +88,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
local_irq_save(flags);
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+ /*
+ * On lockdep we dont want the hand-coded irq-enable of
+ * _raw_spin_lock_flags() code, because lockdep assumes
+ * that interrupts are not re-enabled during lock-acquire:
+ */
+#ifdef CONFIG_PROVE_LOCKING
+ _raw_spin_lock(lock);
+#else
_raw_spin_lock_flags(lock, &flags);
+#endif
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave);
@@ -83,6 +107,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
{
local_irq_disable();
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock_irq);
@@ -91,6 +116,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_spin_lock(lock);
}
EXPORT_SYMBOL(_spin_lock_bh);
@@ -101,6 +127,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
return flags;
}
@@ -110,6 +137,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
{
local_irq_disable();
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock_irq);
@@ -118,6 +146,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
+ rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
_raw_read_lock(lock);
}
EXPORT_SYMBOL(_read_lock_bh);
@@ -128,6 +157,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
return flags;
}
@@ -137,6 +167,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
{
local_irq_disable();
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
}
EXPORT_SYMBOL(_write_lock_irq);
@@ -145,6 +176,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
{
local_bh_disable();
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
}
EXPORT_SYMBOL(_write_lock_bh);
@@ -152,6 +184,7 @@ EXPORT_SYMBOL(_write_lock_bh);
void __lockfunc _spin_lock(spinlock_t *lock)
{
preempt_disable();
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_spin_lock(lock);
}
@@ -160,6 +193,7 @@ EXPORT_SYMBOL(_spin_lock);
void __lockfunc _write_lock(rwlock_t *lock)
{
preempt_disable();
+ rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_write_lock(lock);
}
@@ -255,8 +289,22 @@ BUILD_LOCK_OPS(write, rwlock);
#endif /* CONFIG_PREEMPT */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
+{
+ preempt_disable();
+ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ _raw_spin_lock(lock);
+}
+
+EXPORT_SYMBOL(_spin_lock_nested);
+
+#endif
+
void __lockfunc _spin_unlock(spinlock_t *lock)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
preempt_enable();
}
@@ -264,6 +312,7 @@ EXPORT_SYMBOL(_spin_unlock);
void __lockfunc _write_unlock(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
preempt_enable();
}
@@ -271,6 +320,7 @@ EXPORT_SYMBOL(_write_unlock);
void __lockfunc _read_unlock(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
preempt_enable();
}
@@ -278,6 +328,7 @@ EXPORT_SYMBOL(_read_unlock);
void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
local_irq_restore(flags);
preempt_enable();
@@ -286,6 +337,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore);
void __lockfunc _spin_unlock_irq(spinlock_t *lock)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
local_irq_enable();
preempt_enable();
@@ -294,14 +346,16 @@ EXPORT_SYMBOL(_spin_unlock_irq);
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
{
+ spin_release(&lock->dep_map, 1, _RET_IP_);
_raw_spin_unlock(lock);
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(_spin_unlock_bh);
void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
local_irq_restore(flags);
preempt_enable();
@@ -310,6 +364,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore);
void __lockfunc _read_unlock_irq(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
local_irq_enable();
preempt_enable();
@@ -318,14 +373,16 @@ EXPORT_SYMBOL(_read_unlock_irq);
void __lockfunc _read_unlock_bh(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_read_unlock(lock);
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(_read_unlock_bh);
void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
local_irq_restore(flags);
preempt_enable();
@@ -334,6 +391,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore);
void __lockfunc _write_unlock_irq(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
local_irq_enable();
preempt_enable();
@@ -342,9 +400,10 @@ EXPORT_SYMBOL(_write_unlock_irq);
void __lockfunc _write_unlock_bh(rwlock_t *lock)
{
+ rwlock_release(&lock->dep_map, 1, _RET_IP_);
_raw_write_unlock(lock);
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}
EXPORT_SYMBOL(_write_unlock_bh);
@@ -352,11 +411,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
{
local_bh_disable();
preempt_disable();
- if (_raw_spin_trylock(lock))
+ if (_raw_spin_trylock(lock)) {
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
return 1;
+ }
preempt_enable_no_resched();
- local_bh_enable();
+ local_bh_enable_ip((unsigned long)__builtin_return_address(0));
return 0;
}
EXPORT_SYMBOL(_spin_trylock_bh);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
new file mode 100644
index 00000000000..b71816e47a3
--- /dev/null
+++ b/kernel/stacktrace.c
@@ -0,0 +1,24 @@
+/*
+ * kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
+
+void print_stack_trace(struct stack_trace *trace, int spaces)
+{
+ int i, j;
+
+ for (i = 0; i < trace->nr_entries; i++) {
+ unsigned long ip = trace->entries[i];
+
+ for (j = 0; j < spaces + 1; j++)
+ printk(" ");
+ print_ip_sym(ip);
+ }
+}
+
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2c0aacc37c5..51cacd111db 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -4,7 +4,6 @@
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/syscalls.h>
-#include <linux/kthread.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
#include <asm/uaccess.h>
@@ -26,11 +25,13 @@ static unsigned int stopmachine_num_threads;
static atomic_t stopmachine_thread_ack;
static DECLARE_MUTEX(stopmachine_mutex);
-static int stopmachine(void *unused)
+static int stopmachine(void *cpu)
{
int irqs_disabled = 0;
int prepared = 0;
+ set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
+
/* Ack: we are alive */
smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
atomic_inc(&stopmachine_thread_ack);
@@ -84,8 +85,7 @@ static void stopmachine_set_state(enum stopmachine_state state)
static int stop_machine(void)
{
- int ret = 0;
- unsigned int i;
+ int i, ret = 0;
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
/* One high-prio thread per cpu. We'll do this one. */
@@ -96,16 +96,11 @@ static int stop_machine(void)
stopmachine_state = STOPMACHINE_WAIT;
for_each_online_cpu(i) {
- struct task_struct *tsk;
if (i == raw_smp_processor_id())
continue;
- tsk = kthread_create(stopmachine, NULL, "stopmachine");
- if (IS_ERR(tsk)) {
- ret = PTR_ERR(tsk);
+ ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
+ if (ret < 0)
break;
- }
- kthread_bind(tsk, i);
- wake_up_process(tsk);
stopmachine_num_threads++;
}
@@ -116,7 +111,6 @@ static int stop_machine(void)
/* If some failed, kill them all. */
if (ret < 0) {
stopmachine_set_state(STOPMACHINE_EXIT);
- up(&stopmachine_mutex);
return ret;
}
diff --git a/kernel/sys.c b/kernel/sys.c
index dbb3b9c7ea6..e236f98f7ec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
error = current->mm->dumpable;
break;
case PR_SET_DUMPABLE:
- if (arg2 < 0 || arg2 > 2) {
+ if (arg2 < 0 || arg2 > 1) {
error = -EINVAL;
break;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 99a58f27907..362a0cc3713 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -932,6 +932,17 @@ static ctl_table vm_table[] = {
.strategy = &sysctl_intvec,
.extra1 = &zero,
},
+ {
+ .ctl_name = VM_MIN_UNMAPPED,
+ .procname = "min_unmapped_ratio",
+ .data = &sysctl_min_unmapped_ratio,
+ .maxlen = sizeof(sysctl_min_unmapped_ratio),
+ .mode = 0644,
+ .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler,
+ .strategy = &sysctl_intvec,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
#endif
#ifdef CONFIG_X86_32
{
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 00000000000..e7818765733
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,564 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ * (C) Balbir Singh, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+
+static struct genl_family family = {
+ .id = GENL_ID_GENERATE,
+ .name = TASKSTATS_GENL_NAME,
+ .version = TASKSTATS_GENL_VERSION,
+ .maxattr = TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+ [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+ [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+ [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+struct listener {
+ struct list_head list;
+ pid_t pid;
+ char valid;
+};
+
+struct listener_list {
+ struct rw_semaphore sem;
+ struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+ REGISTER,
+ DEREGISTER,
+ CPU_DONT_CARE
+};
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+ void **replyp, size_t size)
+{
+ struct sk_buff *skb;
+ void *reply;
+
+ /*
+ * If new attributes are added, please revisit this allocation
+ */
+ skb = nlmsg_new(size);
+ if (!skb)
+ return -ENOMEM;
+
+ if (!info) {
+ int seq = get_cpu_var(taskstats_seqnum)++;
+ put_cpu_var(taskstats_seqnum);
+
+ reply = genlmsg_put(skb, 0, seq,
+ family.id, 0, 0,
+ cmd, family.version);
+ } else
+ reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+ family.id, 0, 0,
+ cmd, family.version);
+ if (reply == NULL) {
+ nlmsg_free(skb);
+ return -EINVAL;
+ }
+
+ *skbp = skb;
+ *replyp = reply;
+ return 0;
+}
+
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ void *reply = genlmsg_data(genlhdr);
+ int rc;
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return rc;
+ }
+
+ return genlmsg_unicast(skb, pid);
+}
+
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+ struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ struct listener_list *listeners;
+ struct listener *s, *tmp;
+ struct sk_buff *skb_next, *skb_cur = skb;
+ void *reply = genlmsg_data(genlhdr);
+ int rc, delcount = 0;
+
+ rc = genlmsg_end(skb, reply);
+ if (rc < 0) {
+ nlmsg_free(skb);
+ return;
+ }
+
+ rc = 0;
+ listeners = &per_cpu(listener_array, cpu);
+ down_read(&listeners->sem);
+ list_for_each_entry(s, &listeners->list, list) {
+ skb_next = NULL;
+ if (!list_is_last(&s->list, &listeners->list)) {
+ skb_next = skb_clone(skb_cur, GFP_KERNEL);
+ if (!skb_next)
+ break;
+ }
+ rc = genlmsg_unicast(skb_cur, s->pid);
+ if (rc == -ECONNREFUSED) {
+ s->valid = 0;
+ delcount++;
+ }
+ skb_cur = skb_next;
+ }
+ up_read(&listeners->sem);
+
+ if (skb_cur)
+ nlmsg_free(skb_cur);
+
+ if (!delcount)
+ return;
+
+ /* Delete invalidated entries */
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (!s->valid) {
+ list_del(&s->list);
+ kfree(s);
+ }
+ }
+ up_write(&listeners->sem);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+ struct taskstats *stats)
+{
+ int rc = 0;
+ struct task_struct *tsk = pidtsk;
+
+ if (!pidtsk) {
+ read_lock(&tasklist_lock);
+ tsk = find_task_by_pid(pid);
+ if (!tsk) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ get_task_struct(tsk);
+ read_unlock(&tasklist_lock);
+ } else
+ get_task_struct(tsk);
+
+ /*
+ * Each accounting subsystem adds calls to its functions to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+
+ delayacct_add_tsk(stats, tsk);
+ stats->version = TASKSTATS_VERSION;
+
+ /* Define err: label here if needed */
+ put_task_struct(tsk);
+ return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+ struct taskstats *stats)
+{
+ struct task_struct *tsk, *first;
+ unsigned long flags;
+
+ /*
+ * Add additional stats from live tasks except zombie thread group
+ * leaders who are already counted with the dead tasks
+ */
+ first = tgidtsk;
+ if (!first) {
+ read_lock(&tasklist_lock);
+ first = find_task_by_pid(tgid);
+ if (!first) {
+ read_unlock(&tasklist_lock);
+ return -ESRCH;
+ }
+ get_task_struct(first);
+ read_unlock(&tasklist_lock);
+ } else
+ get_task_struct(first);
+
+ /* Start with stats from dead tasks */
+ spin_lock_irqsave(&first->signal->stats_lock, flags);
+ if (first->signal->stats)
+ memcpy(stats, first->signal->stats, sizeof(*stats));
+ spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+
+ tsk = first;
+ read_lock(&tasklist_lock);
+ do {
+ if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+ continue;
+ /*
+ * Accounting subsystem can call its functions here to
+ * fill in relevant parts of struct taskstsats as follows
+ *
+ * per-task-foo(stats, tsk);
+ */
+ delayacct_add_tsk(stats, tsk);
+
+ } while_each_thread(first, tsk);
+ read_unlock(&tasklist_lock);
+ stats->version = TASKSTATS_VERSION;
+
+ /*
+ * Accounting subsytems can also add calls here to modify
+ * fields of taskstats.
+ */
+
+ return 0;
+}
+
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+ if (!tsk->signal->stats)
+ goto ret;
+
+ /*
+ * Each accounting subsystem calls its functions here to
+ * accumalate its per-task stats for tsk, into the per-tgid structure
+ *
+ * per-task-foo(tsk->signal->stats, tsk);
+ */
+ delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+ spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+ return;
+}
+
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+ struct listener_list *listeners;
+ struct listener *s, *tmp;
+ unsigned int cpu;
+ cpumask_t mask = *maskp;
+
+ if (!cpus_subset(mask, cpu_possible_map))
+ return -EINVAL;
+
+ if (isadd == REGISTER) {
+ for_each_cpu_mask(cpu, mask) {
+ s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!s)
+ goto cleanup;
+ s->pid = pid;
+ INIT_LIST_HEAD(&s->list);
+ s->valid = 1;
+
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_add(&s->list, &listeners->list);
+ up_write(&listeners->sem);
+ }
+ return 0;
+ }
+
+ /* Deregister or cleanup */
+cleanup:
+ for_each_cpu_mask(cpu, mask) {
+ listeners = &per_cpu(listener_array, cpu);
+ down_write(&listeners->sem);
+ list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+ if (s->pid == pid) {
+ list_del(&s->list);
+ kfree(s);
+ break;
+ }
+ }
+ up_write(&listeners->sem);
+ }
+ return 0;
+}
+
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+ char *data;
+ int len;
+ int ret;
+
+ if (na == NULL)
+ return 1;
+ len = nla_len(na);
+ if (len > TASKSTATS_CPUMASK_MAXLEN)
+ return -E2BIG;
+ if (len < 1)
+ return -EINVAL;
+ data = kmalloc(len, GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ nla_strlcpy(data, na, len);
+ ret = cpulist_parse(data, *mask);
+ kfree(data);
+ return ret;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ int rc = 0;
+ struct sk_buff *rep_skb;
+ struct taskstats stats;
+ void *reply;
+ size_t size;
+ struct nlattr *na;
+ cpumask_t mask;
+
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+ if (rc < 0)
+ return rc;
+ if (rc == 0)
+ return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+ rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+ if (rc < 0)
+ return rc;
+ if (rc == 0)
+ return add_del_listener(info->snd_pid, &mask, DEREGISTER);
+
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ memset(&stats, 0, sizeof(stats));
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+ if (rc < 0)
+ return rc;
+
+ if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+ u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+ rc = fill_pid(pid, NULL, &stats);
+ if (rc < 0)
+ goto err;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ stats);
+ } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+ u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+ rc = fill_tgid(tgid, NULL, &stats);
+ if (rc < 0)
+ goto err;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ stats);
+ } else {
+ rc = -EINVAL;
+ goto err;
+ }
+
+ nla_nest_end(rep_skb, na);
+
+ return send_reply(rep_skb, info->snd_pid);
+
+nla_put_failure:
+ return genlmsg_cancel(rep_skb, reply);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+{
+ struct listener_list *listeners;
+ struct taskstats *tmp;
+ /*
+ * This is the cpu on which the task is exiting currently and will
+ * be the one for which the exit event is sent, even if the cpu
+ * on which this function is running changes later.
+ */
+ *mycpu = raw_smp_processor_id();
+
+ *ptidstats = NULL;
+ tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+ if (!tmp)
+ return;
+
+ listeners = &per_cpu(listener_array, *mycpu);
+ down_read(&listeners->sem);
+ if (!list_empty(&listeners->list)) {
+ *ptidstats = tmp;
+ tmp = NULL;
+ }
+ up_read(&listeners->sem);
+ kfree(tmp);
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+ int group_dead, unsigned int mycpu)
+{
+ int rc;
+ struct sk_buff *rep_skb;
+ void *reply;
+ size_t size;
+ int is_thread_group;
+ struct nlattr *na;
+ unsigned long flags;
+
+ if (!family_registered || !tidstats)
+ return;
+
+ spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+ is_thread_group = tsk->signal->stats ? 1 : 0;
+ spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+
+ rc = 0;
+ /*
+ * Size includes space for nested attributes
+ */
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ if (is_thread_group)
+ size = 2 * size; /* PID + STATS + TGID + STATS */
+
+ rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+ if (rc < 0)
+ goto ret;
+
+ rc = fill_pid(tsk->pid, tsk, tidstats);
+ if (rc < 0)
+ goto err_skb;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ *tidstats);
+ nla_nest_end(rep_skb, na);
+
+ if (!is_thread_group)
+ goto send;
+
+ /*
+ * tsk has/had a thread group so fill the tsk->signal->stats structure
+ * Doesn't matter if tsk is the leader or the last group member leaving
+ */
+
+ fill_tgid_exit(tsk);
+ if (!group_dead)
+ goto send;
+
+ na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+ NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+ /* No locking needed for tsk->signal->stats since group is dead */
+ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+ *tsk->signal->stats);
+ nla_nest_end(rep_skb, na);
+
+send:
+ send_cpu_listeners(rep_skb, mycpu);
+ return;
+
+nla_put_failure:
+ genlmsg_cancel(rep_skb, reply);
+ goto ret;
+err_skb:
+ nlmsg_free(rep_skb);
+ret:
+ return;
+}
+
+static struct genl_ops taskstats_ops = {
+ .cmd = TASKSTATS_CMD_GET,
+ .doit = taskstats_user_cmd,
+ .policy = taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+ unsigned int i;
+
+ taskstats_cache = kmem_cache_create("taskstats_cache",
+ sizeof(struct taskstats),
+ 0, SLAB_PANIC, NULL, NULL);
+ for_each_possible_cpu(i) {
+ INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+ init_rwsem(&(per_cpu(listener_array, i).sem));
+ }
+}
+
+static int __init taskstats_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&family);
+ if (rc)
+ return rc;
+
+ rc = genl_register_ops(&family, &taskstats_ops);
+ if (rc < 0)
+ goto err;
+
+ family_registered = 1;
+ return 0;
+err:
+ genl_unregister_family(&family);
+ return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
diff --git a/kernel/timer.c b/kernel/timer.c
index 5a896025306..1d7dd6267c2 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
-static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases };
+static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
static inline void set_running_timer(tvec_base_t *base,
struct timer_list *timer)
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer)
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
return ret;
+ cpu_relax();
}
}
@@ -407,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
* This function cascades all vectors and executes all expired timer
* vectors.
*/
-#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
+#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
static inline void __run_timers(tvec_base_t *base)
{
@@ -891,6 +892,7 @@ int do_settimeofday(struct timespec *tv)
set_normalized_timespec(&xtime, sec, nsec);
set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+ clock->error = 0;
ntp_clear();
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -967,6 +969,7 @@ void __init timekeeping_init(void)
}
+static int timekeeping_suspended;
/*
* timekeeping_resume - Resumes the generic timekeeping subsystem.
* @dev: unused
@@ -982,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev)
write_seqlock_irqsave(&xtime_lock, flags);
/* restart the last cycle value */
clock->cycle_last = clocksource_read(clock);
+ clock->error = 0;
+ timekeeping_suspended = 0;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ timekeeping_suspended = 1;
write_sequnlock_irqrestore(&xtime_lock, flags);
return 0;
}
@@ -989,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev)
/* sysfs resume/suspend bits for timekeeping */
static struct sysdev_class timekeeping_sysclass = {
.resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
set_kset_name("timekeeping"),
};
@@ -1008,52 +1024,52 @@ static int __init timekeeping_init_device(void)
device_initcall(timekeeping_init_device);
/*
- * If the error is already larger, we look ahead another tick,
+ * If the error is already larger, we look ahead even further
* to compensate for late or lost adjustments.
*/
-static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
{
- int adj;
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
/*
- * As soon as the machine is synchronized to the external time
- * source this should be the common case.
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
*/
- error >>= 2;
- if (likely(sign > 0 ? error <= *interval : error >= *interval))
- return sign;
+ error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
/*
- * An extra look ahead dampens the effect of the current error,
- * which can grow quite large with continously late updates, as
- * it would dominate the adjustment value and can lead to
- * oscillation.
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
*/
- error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
- error -= clock->xtime_interval >> 1;
-
- adj = 0;
- while (1) {
- error >>= 1;
- if (sign > 0 ? error <= *interval : error >= *interval)
- break;
- adj++;
+ tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
+ tick_error -= clock->xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) {
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
}
-
- /*
- * Add the current adjustments to the error and take the offset
- * into account, the latter can cause the error to be hardly
- * reduced at the next tick. Check the error again if there's
- * room for another adjustment, thus further reducing the error
- * which otherwise had to be corrected at the next update.
- */
- error = (error << 1) - *interval + *offset;
- if (sign > 0 ? error > *interval : error < *interval)
- adj++;
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
*interval <<= adj;
*offset <<= adj;
- return sign << adj;
+ return mult << adj;
}
/*
@@ -1068,11 +1084,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
if (error > interval) {
- adj = clocksource_bigadjust(1, error, &interval, &offset);
+ error >>= 2;
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = clocksource_bigadjust(error, &interval, &offset);
} else if (error < -interval) {
- interval = -interval;
- offset = -offset;
- adj = clocksource_bigadjust(-1, error, &interval, &offset);
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = clocksource_bigadjust(error, &interval, &offset);
} else
return;
@@ -1091,13 +1115,16 @@ static void update_wall_time(void)
{
cycle_t offset;
- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ return;
#ifdef CONFIG_GENERIC_TIME
offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
#else
offset = clock->cycle_interval;
#endif
+ clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
/* normally this loop will run just once, however in the
* case of lost or late ticks, it will accumulate correctly.
@@ -1129,7 +1156,7 @@ static void update_wall_time(void)
clocksource_adjust(clock, offset);
/* store full nanoseconds into xtime */
- xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
+ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
/* check to see if there is a new clocksource to use */
@@ -1208,7 +1235,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES;
* playing with xtime and avenrun.
*/
#ifndef ARCH_HAVE_XTIME_LOCK
-seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
EXPORT_SYMBOL(xtime_lock);
#endif
@@ -1297,46 +1324,19 @@ asmlinkage long sys_getpid(void)
}
/*
- * Accessing ->group_leader->real_parent is not SMP-safe, it could
- * change from under us. However, rather than getting any lock
- * we can use an optimistic algorithm: get the parent
- * pid, and go back and check that the parent is still
- * the same. If it has changed (which is extremely unlikely
- * indeed), we just try again..
- *
- * NOTE! This depends on the fact that even if we _do_
- * get an old value of "parent", we can happily dereference
- * the pointer (it was and remains a dereferencable kernel pointer
- * no matter what): we just can't necessarily trust the result
- * until we know that the parent pointer is valid.
- *
- * NOTE2: ->group_leader never changes from under us.
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
*/
asmlinkage long sys_getppid(void)
{
int pid;
- struct task_struct *me = current;
- struct task_struct *parent;
- parent = me->group_leader->real_parent;
- for (;;) {
- pid = parent->tgid;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-{
- struct task_struct *old = parent;
+ rcu_read_lock();
+ pid = rcu_dereference(current->real_parent)->tgid;
+ rcu_read_unlock();
- /*
- * Make sure we read the pid before re-reading the
- * parent pointer:
- */
- smp_rmb();
- parent = me->group_leader->real_parent;
- if (old != parent)
- continue;
-}
-#endif
- break;
- }
return pid;
}
@@ -1368,7 +1368,7 @@ asmlinkage long sys_getegid(void)
static void process_timeout(unsigned long __data)
{
- wake_up_process((task_t *)__data);
+ wake_up_process((struct task_struct *)__data);
}
/**
@@ -1559,6 +1559,13 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
return 0;
}
+/*
+ * lockdep: we want to track each per-CPU base as a separate lock-class,
+ * but timer-bases are kmalloc()-ed, so we need to attach separate
+ * keys to them:
+ */
+static struct lock_class_key base_lock_keys[NR_CPUS];
+
static int __devinit init_timers_cpu(int cpu)
{
int j;
@@ -1594,6 +1601,8 @@ static int __devinit init_timers_cpu(int cpu)
}
spin_lock_init(&base->lock);
+ lockdep_set_class(&base->lock, base_lock_keys + cpu);
+
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1652,7 +1661,7 @@ static void __devinit migrate_timers(int cpu)
}
#endif /* CONFIG_HOTPLUG_CPU */
-static int __devinit timer_cpu_notify(struct notifier_block *self,
+static int __cpuinit timer_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
@@ -1672,7 +1681,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __devinitdata timers_nb = {
+static struct notifier_block __cpuinitdata timers_nb = {
.notifier_call = timer_cpu_notify,
};
diff --git a/kernel/wait.c b/kernel/wait.c
index 5985d866531..59a82f63275 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,6 +10,14 @@
#include <linux/wait.h>
#include <linux/hash.h>
+void init_waitqueue_head(wait_queue_head_t *q)
+{
+ spin_lock_init(&q->lock);
+ INIT_LIST_HEAD(&q->task_list);
+}
+
+EXPORT_SYMBOL(init_waitqueue_head);
+
void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 59f0b42bd89..835fe28b87a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,7 +51,7 @@ struct cpu_workqueue_struct {
wait_queue_head_t work_done;
struct workqueue_struct *wq;
- task_t *thread;
+ struct task_struct *thread;
int run_depth; /* Detect run_workqueue() recursion depth */
} ____cacheline_aligned;
@@ -68,7 +68,7 @@ struct workqueue_struct {
/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
threads to each one as cpus come/go. */
-static DEFINE_SPINLOCK(workqueue_lock);
+static DEFINE_MUTEX(workqueue_mutex);
static LIST_HEAD(workqueues);
static int singlethread_cpu;
@@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
spin_unlock_irqrestore(&cwq->lock, flags);
}
-/*
- * Queue work on a workqueue. Return non-zero if it was successfully
- * added.
+/**
+ * queue_work - queue work on a workqueue
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns non-zero if it was successfully added.
*
* We queue the work to the CPU it was submitted, but there is no
* guarantee that it will be processed by that CPU.
@@ -114,6 +117,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
put_cpu();
return ret;
}
+EXPORT_SYMBOL_GPL(queue_work);
static void delayed_work_timer_fn(unsigned long __data)
{
@@ -127,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data)
__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
}
+/**
+ * queue_delayed_work - queue work on a workqueue after delay
+ * @wq: workqueue to use
+ * @work: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns non-zero if it was successfully added.
+ */
int fastcall queue_delayed_work(struct workqueue_struct *wq,
struct work_struct *work, unsigned long delay)
{
@@ -147,6 +159,38 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
}
return ret;
}
+EXPORT_SYMBOL_GPL(queue_delayed_work);
+
+/**
+ * queue_delayed_work_on - queue work on specific CPU after delay
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ * @delay: number of jiffies to wait before queueing
+ *
+ * Returns non-zero if it was successfully added.
+ */
+int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
+ struct work_struct *work, unsigned long delay)
+{
+ int ret = 0;
+ struct timer_list *timer = &work->timer;
+
+ if (!test_and_set_bit(0, &work->pending)) {
+ BUG_ON(timer_pending(timer));
+ BUG_ON(!list_empty(&work->entry));
+
+ /* This stores wq for the moment, for the timer_fn */
+ work->wq_data = wq;
+ timer->expires = jiffies + delay;
+ timer->data = (unsigned long)work;
+ timer->function = delayed_work_timer_fn;
+ add_timer_on(timer, cpu);
+ ret = 1;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(queue_delayed_work_on);
static void run_workqueue(struct cpu_workqueue_struct *cwq)
{
@@ -251,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
}
}
-/*
+/**
* flush_workqueue - ensure that any scheduled work has run to completion.
+ * @wq: workqueue to flush
*
* Forces execution of the workqueue and blocks until its completion.
* This is typically used in driver shutdown handlers.
@@ -275,12 +320,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
} else {
int cpu;
- lock_cpu_hotplug();
+ mutex_lock(&workqueue_mutex);
for_each_online_cpu(cpu)
flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
- unlock_cpu_hotplug();
+ mutex_unlock(&workqueue_mutex);
}
}
+EXPORT_SYMBOL_GPL(flush_workqueue);
static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
int cpu)
@@ -325,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
}
wq->name = name;
- /* We don't need the distraction of CPUs appearing and vanishing. */
- lock_cpu_hotplug();
+ mutex_lock(&workqueue_mutex);
if (singlethread) {
INIT_LIST_HEAD(&wq->list);
p = create_workqueue_thread(wq, singlethread_cpu);
@@ -335,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
else
wake_up_process(p);
} else {
- spin_lock(&workqueue_lock);
list_add(&wq->list, &workqueues);
- spin_unlock(&workqueue_lock);
for_each_online_cpu(cpu) {
p = create_workqueue_thread(wq, cpu);
if (p) {
@@ -347,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
destroy = 1;
}
}
- unlock_cpu_hotplug();
+ mutex_unlock(&workqueue_mutex);
/*
* Was there any error during startup? If yes then clean up:
@@ -358,6 +401,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
}
return wq;
}
+EXPORT_SYMBOL_GPL(__create_workqueue);
static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
{
@@ -374,6 +418,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
kthread_stop(p);
}
+/**
+ * destroy_workqueue - safely terminate a workqueue
+ * @wq: target workqueue
+ *
+ * Safely destroy a workqueue. All work currently pending will be done first.
+ */
void destroy_workqueue(struct workqueue_struct *wq)
{
int cpu;
@@ -381,52 +431,63 @@ void destroy_workqueue(struct workqueue_struct *wq)
flush_workqueue(wq);
/* We don't need the distraction of CPUs appearing and vanishing. */
- lock_cpu_hotplug();
+ mutex_lock(&workqueue_mutex);
if (is_single_threaded(wq))
cleanup_workqueue_thread(wq, singlethread_cpu);
else {
for_each_online_cpu(cpu)
cleanup_workqueue_thread(wq, cpu);
- spin_lock(&workqueue_lock);
list_del(&wq->list);
- spin_unlock(&workqueue_lock);
}
- unlock_cpu_hotplug();
+ mutex_unlock(&workqueue_mutex);
free_percpu(wq->cpu_wq);
kfree(wq);
}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
static struct workqueue_struct *keventd_wq;
+/**
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
+ *
+ * This puts a job in the kernel-global workqueue.
+ */
int fastcall schedule_work(struct work_struct *work)
{
return queue_work(keventd_wq, work);
}
+EXPORT_SYMBOL(schedule_work);
+/**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @work: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ */
int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
{
return queue_delayed_work(keventd_wq, work, delay);
}
+EXPORT_SYMBOL(schedule_delayed_work);
+/**
+ * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
+ * @cpu: cpu to use
+ * @work: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue on the specified CPU.
+ */
int schedule_delayed_work_on(int cpu,
struct work_struct *work, unsigned long delay)
{
- int ret = 0;
- struct timer_list *timer = &work->timer;
-
- if (!test_and_set_bit(0, &work->pending)) {
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
- /* This stores keventd_wq for the moment, for the timer_fn */
- work->wq_data = keventd_wq;
- timer->expires = jiffies + delay;
- timer->data = (unsigned long)work;
- timer->function = delayed_work_timer_fn;
- add_timer_on(timer, cpu);
- ret = 1;
- }
- return ret;
+ return queue_delayed_work_on(cpu, keventd_wq, work, delay);
}
+EXPORT_SYMBOL(schedule_delayed_work_on);
/**
* schedule_on_each_cpu - call a function on each online CPU from keventd
@@ -449,11 +510,13 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info)
if (!works)
return -ENOMEM;
+ mutex_lock(&workqueue_mutex);
for_each_online_cpu(cpu) {
INIT_WORK(per_cpu_ptr(works, cpu), func, info);
__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
per_cpu_ptr(works, cpu));
}
+ mutex_unlock(&workqueue_mutex);
flush_workqueue(keventd_wq);
free_percpu(works);
return 0;
@@ -463,6 +526,7 @@ void flush_scheduled_work(void)
{
flush_workqueue(keventd_wq);
}
+EXPORT_SYMBOL(flush_scheduled_work);
/**
* cancel_rearming_delayed_workqueue - reliably kill off a delayed
@@ -568,6 +632,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
+ mutex_lock(&workqueue_mutex);
/* Create a new workqueue thread for it. */
list_for_each_entry(wq, &workqueues, list) {
if (!create_workqueue_thread(wq, hotcpu)) {
@@ -586,6 +651,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
kthread_bind(cwq->thread, hotcpu);
wake_up_process(cwq->thread);
}
+ mutex_unlock(&workqueue_mutex);
break;
case CPU_UP_CANCELED:
@@ -597,6 +663,15 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
any_online_cpu(cpu_online_map));
cleanup_workqueue_thread(wq, hotcpu);
}
+ mutex_unlock(&workqueue_mutex);
+ break;
+
+ case CPU_DOWN_PREPARE:
+ mutex_lock(&workqueue_mutex);
+ break;
+
+ case CPU_DOWN_FAILED:
+ mutex_unlock(&workqueue_mutex);
break;
case CPU_DEAD:
@@ -604,6 +679,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
cleanup_workqueue_thread(wq, hotcpu);
list_for_each_entry(wq, &workqueues, list)
take_over_work(wq, hotcpu);
+ mutex_unlock(&workqueue_mutex);
break;
}
@@ -619,13 +695,3 @@ void init_workqueues(void)
BUG_ON(!keventd_wq);
}
-EXPORT_SYMBOL_GPL(__create_workqueue);
-EXPORT_SYMBOL_GPL(queue_work);
-EXPORT_SYMBOL_GPL(queue_delayed_work);
-EXPORT_SYMBOL_GPL(flush_workqueue);
-EXPORT_SYMBOL_GPL(destroy_workqueue);
-
-EXPORT_SYMBOL(schedule_work);
-EXPORT_SYMBOL(schedule_delayed_work);
-EXPORT_SYMBOL(schedule_delayed_work_on);
-EXPORT_SYMBOL(flush_scheduled_work);