diff options
Diffstat (limited to 'kernel')
68 files changed, 9184 insertions, 1625 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 58908f9d156..82fb182f6f6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,18 +10,22 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o +obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_STACK_UNWIND) += unwind.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o diff --git a/kernel/acct.c b/kernel/acct.c index b327f4d2010..126ca43d5d2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. 
*/ -static void do_acct_process(long, struct file *); +static void do_acct_process(struct file *); /* * This structure is used so that all the data protected by lock @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_vfsmnt); spin_unlock(&acct_globals.lock); - do_acct_process(0, old_acct); + do_acct_process(old_acct); filp_close(old_acct, NULL); spin_lock(&acct_globals.lock); } @@ -419,16 +419,15 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(long exitcode, struct file *file) +static void do_acct_process(struct file *file) { + struct pacct_struct *pacct = &current->signal->pacct; acct_t ac; mm_segment_t fs; - unsigned long vsize; unsigned long flim; u64 elapsed; u64 run_time; struct timespec uptime; - unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -469,12 +468,6 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - jiffies = cputime_to_jiffies(cputime_add(current->utime, - current->signal->utime)); - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); - jiffies = cputime_to_jiffies(cputime_add(current->stime, - current->signal->stime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -496,37 +489,18 @@ static void do_acct_process(long exitcode, struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - ac.ac_flag = 0; - if (current->flags & 
PF_FORKNOEXEC) - ac.ac_flag |= AFORK; - if (current->flags & PF_SUPERPRIV) - ac.ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - ac.ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - ac.ac_flag |= AXSIG; - - vsize = 0; - if (current->mm) { - struct vm_area_struct *vma; - down_read(&current->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(&current->mm->mmap_sem); - } - vsize = vsize / 1024; - ac.ac_mem = encode_comp_t(vsize); + spin_lock(&current->sighand->siglock); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac.ac_flag = pacct->ac_flag; + ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_minflt = encode_comp_t(pacct->ac_minflt); + ac.ac_majflt = encode_comp_t(pacct->ac_majflt); + ac.ac_exitcode = pacct->ac_exitcode; + spin_unlock(&current->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_minflt = encode_comp_t(current->signal->min_flt + - current->min_flt); - ac.ac_majflt = encode_comp_t(current->signal->maj_flt + - current->maj_flt); ac.ac_swaps = encode_comp_t(0); - ac.ac_exitcode = exitcode; /* * Kernel segment override to datasegment and write it @@ -546,12 +520,64 @@ static void do_acct_process(long exitcode, struct file *file) } /** + * acct_init_pacct - initialize a new pacct_struct + * @pacct: per-process accounting info struct to initialize + */ +void acct_init_pacct(struct pacct_struct *pacct) +{ + memset(pacct, 0, sizeof(struct pacct_struct)); + pacct->ac_utime = pacct->ac_stime = cputime_zero; +} + +/** + * acct_collect - collect accounting information into pacct_struct + * @exitcode: task exit code + * @group_dead: not 0, if this thread is the last one in the process. 
+ */ +void acct_collect(long exitcode, int group_dead) +{ + struct pacct_struct *pacct = &current->signal->pacct; + unsigned long vsize = 0; + + if (group_dead && current->mm) { + struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(&current->mm->mmap_sem); + } + + spin_lock_irq(&current->sighand->siglock); + if (group_dead) + pacct->ac_mem = vsize / 1024; + if (thread_group_leader(current)) { + pacct->ac_exitcode = exitcode; + if (current->flags & PF_FORKNOEXEC) + pacct->ac_flag |= AFORK; + } + if (current->flags & PF_SUPERPRIV) + pacct->ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + pacct->ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + pacct->ac_flag |= AXSIG; + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_minflt += current->min_flt; + pacct->ac_majflt += current->maj_flt; + spin_unlock_irq(&current->sighand->siglock); +} + +/** * acct_process - now just a wrapper around do_acct_process * @exitcode: task exit code * * handles process accounting for an exiting task */ -void acct_process(long exitcode) +void acct_process(void) { struct file *file = NULL; @@ -570,7 +596,7 @@ void acct_process(long exitcode) get_file(file); spin_unlock(&acct_globals.lock); - do_acct_process(exitcode, file); + do_acct_process(file); fput(file); } @@ -599,9 +625,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - if (tsk) { - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; - } + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; } diff --git a/kernel/audit.c b/kernel/audit.c index df57b493e1c..82443fb433e 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -56,6 +56,7 @@ #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/selinux.h> +#include 
<linux/inotify.h> #include "audit.h" @@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ uid_t audit_sig_uid = -1; pid_t audit_sig_pid = -1; +u32 audit_sig_sid = 0; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; +/* Inotify handle. */ +struct inotify_handle *audit_ih; + +/* Hash for inode-based rules */ +struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + /* The audit_freelist is a list of pre-allocated audit buffers (if more * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ @@ -114,10 +122,8 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); -/* The netlink socket is only to be read by 1 CPU, which lets us assume - * that list additions and deletions never happen simultaneously in - * auditsc.c */ -DEFINE_MUTEX(audit_netlink_mutex); +/* Serialize requests from userspace. */ +static DEFINE_MUTEX(audit_cmd_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. 
Since printk uses a 1024 byte buffer, this buffer @@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) "audit_rate_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_rate_limit = limit; - return old; + return 0; } static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) @@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) "audit_backlog_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_backlog_limit = limit; - return old; + return 0; } static int audit_set_enabled(int state, uid_t loginuid, u32 sid) @@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) "audit_enabled=%d old=%d by auid=%u", state, old, loginuid); audit_enabled = state; - return old; + return 0; } static int audit_set_failure(int state, uid_t loginuid, u32 sid) @@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) "audit_failure=%d old=%d by auid=%u", state, old, loginuid); audit_failure = state; - return old; + return 0; } static int kauditd_thread(void *dummy) @@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } +} + +int audit_send_list(void *_dest) +{ + struct audit_netlink_list *dest = _dest; + int pid = dest->pid; + struct sk_buff *skb; + + /* wait for parent to finish and send an ACK */ + mutex_lock(&audit_cmd_mutex); + mutex_unlock(&audit_cmd_mutex); + + while ((skb = __skb_dequeue(&dest->q)) != NULL) + netlink_unicast(audit_sock, skb, pid, 0); + + kfree(dest); + return 0; } +struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, + int multi, void *payload, int size) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len = NLMSG_SPACE(size); + void *data; + int flags = multi ? NLM_F_MULTI : 0; + int t = done ? 
NLMSG_DONE : type; + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + return NULL; + + nlh = NLMSG_PUT(skb, pid, seq, t, size); + nlh->nlmsg_flags = flags; + data = NLMSG_DATA(nlh); + memcpy(data, payload, size); + return skb; + +nlmsg_failure: /* Used by NLMSG_PUT */ + if (skb) + kfree_skb(skb); + return NULL; +} + /** * audit_send_reply - send an audit reply message via netlink * @pid: process id to send reply to @@ -383,29 +432,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { struct sk_buff *skb; - struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); - void *data; - int flags = multi ? NLM_F_MULTI : 0; - int t = done ? NLMSG_DONE : type; - - skb = alloc_skb(len, GFP_KERNEL); + skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) return; - - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); - memcpy(data, payload, size); - /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. 
*/ netlink_unicast(audit_sock, skb, pid, 0); return; - -nlmsg_failure: /* Used by NLMSG_PUT */ - if (skb) - kfree_skb(skb); } /* @@ -451,7 +484,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ - struct audit_sig_info sig_data; + struct audit_sig_info *sig_data; + char *ctx; + u32 len; err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) @@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; if (sid) { - char *ctx = NULL; - u32 len; - int rc; - if ((rc = selinux_ctxid_to_string( + if ((err = selinux_ctxid_to_string( sid, &ctx, &len))) - return rc; + return err; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = status_get->pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit, + err = audit_set_rate_limit(status_get->rate_limit, loginuid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - audit_set_backlog_limit(status_get->backlog_limit, + err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sid); break; case AUDIT_USER: @@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) "user pid=%d uid=%u auid=%u", pid, uid, loginuid); if (sid) { - char *ctx = NULL; - u32 len; if (selinux_ctxid_to_string( sid, &ctx, &len)) { audit_log_format(ab, @@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_SIGNAL_INFO: - sig_data.uid = audit_sig_uid; - sig_data.pid = audit_sig_pid; + err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); + if (err) + return err; + sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); + if (!sig_data) { + kfree(ctx); + 
return -ENOMEM; + } + sig_data->uid = audit_sig_uid; + sig_data->pid = audit_sig_pid; + memcpy(sig_data->ctx, ctx, len); + kfree(ctx); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, - 0, 0, &sig_data, sizeof(sig_data)); + 0, 0, sig_data, sizeof(*sig_data) + len); + kfree(sig_data); break; default: err = -EINVAL; @@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length) struct sk_buff *skb; unsigned int qlen; - mutex_lock(&audit_netlink_mutex); + mutex_lock(&audit_cmd_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); audit_receive_skb(skb); kfree_skb(skb); } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_cmd_mutex); } +#ifdef CONFIG_AUDITSYSCALL +static const struct inotify_operations audit_inotify_ops = { + .handle_event = audit_handle_ievent, + .destroy_watch = audit_free_parent, +}; +#endif /* Initialize audit support at boot time. */ static int __init audit_init(void) { +#ifdef CONFIG_AUDITSYSCALL + int i; +#endif + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? 
"enabled" : "disabled"); audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, @@ -661,6 +712,16 @@ static int __init audit_init(void) selinux_audit_set_callback(&selinux_audit_rule_update); audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + +#ifdef CONFIG_AUDITSYSCALL + audit_ih = inotify_init(&audit_inotify_ops); + if (IS_ERR(audit_ih)) + audit_panic("cannot initialize inotify handle"); + + for (i = 0; i < AUDIT_INODE_BUCKETS; i++) + INIT_LIST_HEAD(&audit_inode_hash[i]); +#endif + return 0; } __initcall(audit_init); @@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab) kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); - if (++audit_freelist_count > AUDIT_MAXFREE) + if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); - else + else { + audit_freelist_count++; list_add(&ab->list, &audit_freelist); + } spin_unlock_irqrestore(&audit_freelist_lock, flags); } @@ -755,7 +818,7 @@ err: */ unsigned int audit_serial(void) { - static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; + static DEFINE_SPINLOCK(serial_lock); static unsigned int serial = 0; unsigned long flags; @@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, skb_put(skb, len << 1); /* new string is twice the old string */ } +/* + * Format a string of no more than slen characters into the audit buffer, + * enclosed in quote marks. 
+ */ +static void audit_log_n_string(struct audit_buffer *ab, size_t slen, + const char *string) +{ + int avail, new_len; + unsigned char *ptr; + struct sk_buff *skb; + + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + new_len = slen + 3; /* enclosing quotes + null terminator */ + if (new_len > avail) { + avail = audit_expand(ab, new_len); + if (!avail) + return; + } + ptr = skb->tail; + *ptr++ = '"'; + memcpy(ptr, string, slen); + ptr += slen; + *ptr++ = '"'; + *ptr = 0; + skb_put(skb, slen + 2); /* don't include null terminator */ +} + /** - * audit_log_unstrustedstring - log a string that may contain random characters + * audit_log_n_unstrustedstring - log a string that may contain random characters * @ab: audit_buffer + * @len: lenth of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). + * + * The caller specifies the number of characters in the string to log, which may + * or may not be the entire string. */ -void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, + const char *string) { const unsigned char *p = string; while (*p) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { - audit_log_hex(ab, string, strlen(string)); - return; + audit_log_hex(ab, string, len); + return string + len + 1; } p++; } - audit_log_format(ab, "\"%s\"", string); + audit_log_n_string(ab, len, string); + return p + 1; +} + +/** + * audit_log_unstrustedstring - log a string that may contain random characters + * @ab: audit_buffer + * @string: string to be logged + * + * Same as audit_log_n_unstrustedstring(), except that strlen is used to + * determine string length. 
+ */ +const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +{ + return audit_log_n_untrustedstring(ab, strlen(string), string); } /* This is a helper-function to print the escaped d_path */ diff --git a/kernel/audit.h b/kernel/audit.h index 6f733920fd3..8323e4132a3 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -19,9 +19,9 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <linux/mutex.h> #include <linux/fs.h> #include <linux/audit.h> +#include <linux/skbuff.h> /* 0 = no checking 1 = put_count checking @@ -53,6 +53,18 @@ enum audit_state { }; /* Rule lists */ +struct audit_parent; + +struct audit_watch { + atomic_t count; /* reference count */ + char *path; /* insertion path */ + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* associated rules */ +}; + struct audit_field { u32 type; u32 val; @@ -70,6 +82,9 @@ struct audit_krule { u32 buflen; /* for data alloc on list rules */ u32 field_count; struct audit_field *fields; + struct audit_field *inode_f; /* quick access to an inode field */ + struct audit_watch *watch; /* associated watch */ + struct list_head rlist; /* entry in audit_watch.rules list */ }; struct audit_entry { @@ -78,15 +93,53 @@ struct audit_entry { struct audit_krule rule; }; - extern int audit_pid; -extern int audit_comparator(const u32 left, const u32 op, const u32 right); +#define AUDIT_INODE_BUCKETS 32 +extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + +static inline int audit_hash_ino(u32 ino) +{ + return (ino & (AUDIT_INODE_BUCKETS-1)); +} + +extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen); +extern struct sk_buff * audit_make_reply(int pid, int seq, 
int type, + int done, int multi, + void *payload, int size); extern void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); -extern struct mutex audit_netlink_mutex; +struct audit_netlink_list { + int pid; + struct sk_buff_head q; +}; + +int audit_send_list(void *); + +struct inotify_watch; +extern void audit_free_parent(struct inotify_watch *); +extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, + const char *, struct inode *); extern int selinux_audit_rule_update(void); + +#ifdef CONFIG_AUDITSYSCALL +extern void __audit_signal_info(int sig, struct task_struct *t); +static inline void audit_signal_info(int sig, struct task_struct *t) +{ + if (unlikely(audit_pid && t->tgid == audit_pid)) + __audit_signal_info(sig, t); +} +extern enum audit_state audit_filter_inodes(struct task_struct *, + struct audit_context *); +extern void audit_set_auditable(struct audit_context *); +#else +#define audit_signal_info(s,t) +#define audit_filter_inodes(t,c) AUDIT_DISABLED +#define audit_set_auditable(c) +#endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7c134906d68..4c99d2c586e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -22,13 +22,59 @@ #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/namei.h> #include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/inotify.h> #include <linux/selinux.h> #include "audit.h" -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ +/* + * Locking model: + * + * audit_filter_mutex: + * Synchronizes writes and blocking reads of audit's filterlist + * data. 
Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * selinux rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. + * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. + */ + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_parent { + struct list_head ilist; /* entry in inotify registration list */ + struct list_head watches; /* associated watches */ + struct inotify_watch wdata; /* inotify watch data */ + unsigned flags; /* status flags */ +}; + +/* + * audit_parent status flags: + * + * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to + * a filesystem event to ensure we're adding audit watches to a valid parent. + * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot + * receive them while we have nameidata, but must be used for IN_MOVE_SELF which + * we can receive while holding nameidata. + */ +#define AUDIT_PARENT_INVALID 0x001 + +/* Audit filter lists, defined in <linux/audit.h> */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), LIST_HEAD_INIT(audit_filter_list[1]), @@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #endif }; +static DEFINE_MUTEX(audit_filter_mutex); + +/* Inotify handle */ +extern struct inotify_handle *audit_ih; + +/* Inotify events we care about. 
*/ +#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF + +void audit_free_parent(struct inotify_watch *i_watch) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} + +static inline void audit_get_watch(struct audit_watch *watch) +{ + atomic_inc(&watch->count); +} + +static void audit_put_watch(struct audit_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + WARN_ON(watch->parent); + WARN_ON(!list_empty(&watch->rules)); + kfree(watch->path); + kfree(watch); + } +} + +static void audit_remove_watch(struct audit_watch *watch) +{ + list_del(&watch->wlist); + put_inotify_watch(&watch->parent->wdata); + watch->parent = NULL; + audit_put_watch(watch); /* match initial get */ +} + static inline void audit_free_rule(struct audit_entry *e) { int i; + + /* some rules don't have associated watches */ + if (e->rule.watch) + audit_put_watch(e->rule.watch); if (e->rule.fields) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; @@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } +/* Initialize a parent watch entry. */ +static struct audit_parent *audit_init_parent(struct nameidata *ndp) +{ + struct audit_parent *parent; + s32 wd; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (unlikely(!parent)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&parent->watches); + parent->flags = 0; + + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); + wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, + AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); + } + + return parent; +} + +/* Initialize a watch entry. 
*/ +static struct audit_watch *audit_init_watch(char *path) +{ + struct audit_watch *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&watch->rules); + atomic_set(&watch->count, 1); + watch->path = path; + watch->dev = (dev_t)-1; + watch->ino = (unsigned long)-1; + + return watch; +} + /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { @@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } +/* Translate an inode field to kernel respresentation. */ +static inline int audit_to_inode(struct audit_krule *krule, + struct audit_field *f) +{ + if (krule->listnr != AUDIT_FILTER_EXIT || + krule->watch || krule->inode_f) + return -EINVAL; + + krule->inode_f = f; + return 0; +} + +/* Translate a watch string to kernel respresentation. */ +static int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op) +{ + struct audit_watch *watch; + + if (!audit_ih) + return -EOPNOTSUPP; + + if (path[0] != '/' || path[len-1] == '/' || + krule->listnr != AUDIT_FILTER_EXIT || + op & ~AUDIT_EQUAL || + krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */ + return -EINVAL; + + watch = audit_init_watch(path); + if (unlikely(IS_ERR(watch))) + return PTR_ERR(watch); + + audit_get_watch(watch); + krule->watch = watch; + + return 0; +} + /* Common user-space to kernel rule translation. 
*/ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -128,8 +299,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) #endif ; } - if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && - rule->action != AUDIT_ALWAYS) + if (unlikely(rule->action == AUDIT_POSSIBLE)) { + printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + goto exit_err; + } + if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) goto exit_err; if (rule->field_count > AUDIT_MAX_FIELDS) goto exit_err; @@ -158,6 +332,7 @@ exit_err: static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; + struct audit_field *f; int err = 0; int i; @@ -172,14 +347,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; - if (f->type & AUDIT_UNUSED_BITS || - f->type == AUDIT_SE_USER || - f->type == AUDIT_SE_ROLE || - f->type == AUDIT_SE_TYPE || - f->type == AUDIT_SE_SEN || - f->type == AUDIT_SE_CLR) { - err = -EINVAL; + err = -EINVAL; + switch(f->type) { + default: goto exit_free; + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; } entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 
2 : 1; @@ -196,6 +394,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) } } + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; + } + } + exit_nofree: return entry; @@ -210,6 +420,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; + struct audit_field *f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -235,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->se_str = NULL; f->se_rule = NULL; switch(f->type) { + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; case AUDIT_SE_USER: case AUDIT_SE_ROLE: case AUDIT_SE_TYPE: @@ -260,6 +494,37 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } else f->se_str = str; break; + case AUDIT_WATCH: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = audit_to_watch(&entry->rule, str, f->val, f->op); + if (err) { + kfree(str); + goto exit_free; + } + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; + default: + goto exit_free; + } + } + + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; } } @@ -291,7 +556,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) 
rule = kmalloc(sizeof(*rule), GFP_KERNEL); if (unlikely(!rule)) - return ERR_PTR(-ENOMEM); + return NULL; memset(rule, 0, sizeof(*rule)); rule->flags = krule->flags | krule->listnr; @@ -322,7 +587,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); if (unlikely(!data)) - return ERR_PTR(-ENOMEM); + return NULL; memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; @@ -343,6 +608,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, f->se_str); break; + case AUDIT_WATCH: + data->buflen += data->values[i] = + audit_pack_string(&bufp, krule->watch->path); + break; default: data->values[i] = f->val; } @@ -378,6 +647,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) return 1; break; + case AUDIT_WATCH: + if (strcmp(a->watch->path, b->watch->path)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -391,6 +664,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } +/* Duplicate the given audit watch. The new watch's rules list is initialized + * to an empty list and wlist is undefined. */ +static struct audit_watch *audit_dupe_watch(struct audit_watch *old) +{ + char *path; + struct audit_watch *new; + + path = kstrdup(old->path, GFP_KERNEL); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + + new = audit_init_watch(path); + if (unlikely(IS_ERR(new))) { + kfree(path); + goto out; + } + + new->dev = old->dev; + new->ino = old->ino; + get_inotify_watch(&old->parent->wdata); + new->parent = old->parent; + +out: + return new; +} + /* Duplicate selinux field information. The se_rule is opaque, so must be * re-initialized. 
*/ static inline int audit_dupe_selinux_field(struct audit_field *df, @@ -422,8 +721,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, /* Duplicate an audit rule. This will be a deep copy with the exception * of the watch - that pointer is carried over. The selinux specific fields * will be updated in the copy. The point is to be able to replace the old - * rule with the new rule in the filterlist, then free the old rule. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old) + * rule with the new rule in the filterlist, then free the old rule. + * The rlist element is undefined; list manipulations are handled apart from + * the initial copy. */ +static struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -442,6 +744,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; new->buflen = old->buflen; + new->inode_f = old->inode_f; + new->watch = NULL; new->field_count = old->field_count; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); @@ -463,68 +767,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) } } + if (watch) { + audit_get_watch(watch); + new->watch = watch; + } + return entry; } -/* Add rule to given filterlist if not a duplicate. Protected by - * audit_netlink_mutex. */ +/* Update inode info in audit rules based on filesystem event. 
*/ +static void audit_update_watch(struct audit_parent *parent, + const char *dname, dev_t dev, + unsigned long ino, unsigned invalidating) +{ + struct audit_watch *owatch, *nwatch, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *oentry, *nentry; + struct audit_buffer *ab; + + mutex_lock(&audit_filter_mutex); + list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { + if (audit_compare_dname_path(dname, owatch->path, NULL)) + continue; + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ + if (invalidating && + audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) + audit_set_auditable(current->audit_context); + + nwatch = audit_dupe_watch(owatch); + if (unlikely(IS_ERR(nwatch))) { + mutex_unlock(&audit_filter_mutex); + audit_panic("error updating watch, skipping"); + return; + } + nwatch->dev = dev; + nwatch->ino = ino; + + list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { + + oentry = container_of(r, struct audit_entry, rule); + list_del(&oentry->rule.rlist); + list_del_rcu(&oentry->list); + + nentry = audit_dupe_rule(&oentry->rule, nwatch); + if (unlikely(IS_ERR(nentry))) + audit_panic("error updating watch, removing"); + else { + int h = audit_hash_ino((u32)ino); + list_add(&nentry->rule.rlist, &nwatch->rules); + list_add_rcu(&nentry->list, &audit_inode_hash[h]); + } + + call_rcu(&oentry->rcu, audit_free_rule_rcu); + } + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "audit updated rules specifying watch="); + audit_log_untrustedstring(ab, owatch->path); + audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); + audit_log_end(ab); + + audit_remove_watch(owatch); + goto add_watch_to_parent; /* event applies to a single watch */ + } + mutex_unlock(&audit_filter_mutex); + return; + +add_watch_to_parent: + list_add(&nwatch->wlist, &parent->watches); + mutex_unlock(&audit_filter_mutex); + return; +} + 
+/* Remove all watches & rules associated with a parent that is going away. */ +static void audit_remove_parent_watches(struct audit_parent *parent) +{ + struct audit_watch *w, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *e; + + mutex_lock(&audit_filter_mutex); + parent->flags |= AUDIT_PARENT_INVALID; + list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { + list_for_each_entry_safe(r, nextr, &w->rules, rlist) { + e = container_of(r, struct audit_entry, rule); + list_del(&r->rlist); + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit implicitly removed rule from list=%d\n", + AUDIT_FILTER_EXIT); + } + audit_remove_watch(w); + } + mutex_unlock(&audit_filter_mutex); +} + +/* Unregister inotify watches for parents on in_list. + * Generates an IN_IGNORED event. */ +static void audit_inotify_unregister(struct list_head *in_list) +{ + struct audit_parent *p, *n; + + list_for_each_entry_safe(p, n, in_list, ilist) { + list_del(&p->ilist); + inotify_rm_watch(audit_ih, &p->wdata); + /* the put matching the get in audit_do_del_rule() */ + put_inotify_watch(&p->wdata); + } +} + +/* Find an existing audit rule. + * Caller must hold audit_filter_mutex to prevent stale rule data. */ +static struct audit_entry *audit_find_rule(struct audit_entry *entry, + struct list_head *list) +{ + struct audit_entry *e, *found = NULL; + int h; + + if (entry->rule.watch) { + /* we don't know the inode number, so must walk entire hash */ + for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { + list = &audit_inode_hash[h]; + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + } + goto out; + } + + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + +out: + return found; +} + +/* Get path information necessary for adding watches. 
*/ +static int audit_get_nd(char *path, struct nameidata **ndp, + struct nameidata **ndw) +{ + struct nameidata *ndparent, *ndwatch; + int err; + + ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); + if (unlikely(!ndparent)) + return -ENOMEM; + + ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); + if (unlikely(!ndwatch)) { + kfree(ndparent); + return -ENOMEM; + } + + err = path_lookup(path, LOOKUP_PARENT, ndparent); + if (err) { + kfree(ndparent); + kfree(ndwatch); + return err; + } + + err = path_lookup(path, 0, ndwatch); + if (err) { + kfree(ndwatch); + ndwatch = NULL; + } + + *ndp = ndparent; + *ndw = ndwatch; + + return 0; +} + +/* Release resources used for watch path information. */ +static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +{ + if (ndp) { + path_release(ndp); + kfree(ndp); + } + if (ndw) { + path_release(ndw); + kfree(ndw); + } +} + +/* Associate the given rule with an existing parent inotify_watch. + * Caller must hold audit_filter_mutex. */ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + get_inotify_watch(&parent->wdata); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. 
*/ +static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw) +{ + struct audit_watch *watch = krule->watch; + struct inotify_watch *i_watch; + struct audit_parent *parent; + int ret = 0; + + /* update watch filter fields */ + if (ndw) { + watch->dev = ndw->dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->dentry->d_inode->i_ino; + } + + /* The audit_filter_mutex must not be held during inotify calls because + * we hold it during inotify event callback processing. If an existing + * inotify watch is found, inotify_find_watch() grabs a reference before + * returning. + */ + mutex_unlock(&audit_filter_mutex); + + if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + parent = audit_init_parent(ndp); + if (IS_ERR(parent)) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + return PTR_ERR(parent); + } + } else + parent = container_of(i_watch, struct audit_parent, wdata); + + mutex_lock(&audit_filter_mutex); + + /* parent was moved before we took audit_filter_mutex */ + if (parent->flags & AUDIT_PARENT_INVALID) + ret = -ENOENT; + else + audit_add_to_parent(krule, parent); + + /* match get in audit_init_parent or inotify_find_watch */ + put_inotify_watch(&parent->wdata); + return ret; +} + +/* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch = entry->rule.watch; + struct nameidata *ndp, *ndw; + int h, err, putnd_needed = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } - /* Do not use the _rcu iterator here, since this is the only - * addition routine. 
*/ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) - return -EEXIST; + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + mutex_unlock(&audit_filter_mutex); + if (e) { + err = -EEXIST; + goto error; + } + + /* Avoid calling path_lookup under audit_filter_mutex. */ + if (watch) { + err = audit_get_nd(watch->path, &ndp, &ndw); + if (err) + goto error; + putnd_needed = 1; + } + + mutex_lock(&audit_filter_mutex); + if (watch) { + /* audit_filter_mutex is dropped and re-taken during this call */ + err = audit_add_watch(&entry->rule, ndp, ndw); + if (err) { + mutex_unlock(&audit_filter_mutex); + goto error; + } + h = audit_hash_ino((u32)watch->ino); + list = &audit_inode_hash[h]; } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add_rcu(&entry->list, list); + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { list_add_tail_rcu(&entry->list, list); } + mutex_unlock(&audit_filter_mutex); - return 0; + if (putnd_needed) + audit_put_nd(ndp, ndw); + + return 0; + +error: + if (putnd_needed) + audit_put_nd(ndp, ndw); + if (watch) + audit_put_watch(watch); /* tmp watch, matches initial get */ + return err; } -/* Remove an existing rule from filterlist. Protected by - * audit_netlink_mutex. */ +/* Remove an existing rule from filterlist. */ static inline int audit_del_rule(struct audit_entry *entry, struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch, *tmp_watch = entry->rule.watch; + LIST_HEAD(inotify_list); + int h, ret = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } - /* Do not use the _rcu iterator here, since this is the only - * deletion routine. 
*/ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) { - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - return 0; + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + if (!e) { + mutex_unlock(&audit_filter_mutex); + ret = -ENOENT; + goto out; + } + + watch = e->rule.watch; + if (watch) { + struct audit_parent *parent = watch->parent; + + list_del(&e->rule.rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + /* Put parent on the inotify un-registration + * list. Grab a reference before releasing + * audit_filter_mutex, to be released in + * audit_inotify_unregister(). */ + list_add(&parent->ilist, &inotify_list); + get_inotify_watch(&parent->wdata); + } } } - return -ENOENT; /* No matching rule */ + + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + mutex_unlock(&audit_filter_mutex); + + if (!list_empty(&inotify_list)) + audit_inotify_unregister(&inotify_list); + +out: + if (tmp_watch) + audit_put_watch(tmp_watch); /* match initial get */ + + return ret; } /* List rules using struct audit_rule. Exists for backward * compatibility with userspace. */ -static int audit_list(void *_dest) +static void audit_list(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *entry; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. */ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. 
*/ for (i=0; i<AUDIT_NR_FILTERS; i++) { list_for_each_entry(entry, &audit_filter_list[i], list) { struct audit_rule *rule; @@ -532,33 +1177,41 @@ static int audit_list(void *_dest) rule = audit_krule_to_rule(&entry->rule); if (unlikely(!rule)) break; - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); kfree(rule); } } - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - - mutex_unlock(&audit_netlink_mutex); - return 0; + for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry(entry, &audit_inode_hash[i], list) { + struct audit_rule *rule; + + rule = audit_krule_to_rule(&entry->rule); + if (unlikely(!rule)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, + rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); + kfree(rule); + } + } + skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /* List rules using struct audit_rule_data. */ -static int audit_list_rules(void *_dest) +static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *e; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. */ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. 
*/ for (i=0; i<AUDIT_NR_FILTERS; i++) { list_for_each_entry(e, &audit_filter_list[i], list) { struct audit_rule_data *data; @@ -566,15 +1219,30 @@ static int audit_list_rules(void *_dest) data = audit_krule_to_data(&e->rule); if (unlikely(!data)) break; - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data)); + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); + if (skb) + skb_queue_tail(q, skb); kfree(data); } } - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + for (i=0; i< AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry(e, &audit_inode_hash[i], list) { + struct audit_rule_data *data; - mutex_unlock(&audit_netlink_mutex); - return 0; + data = audit_krule_to_data(&e->rule); + if (unlikely(!data)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); + if (skb) + skb_queue_tail(q, skb); + kfree(data); + } + } + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /** @@ -592,7 +1260,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, size_t datasz, uid_t loginuid, u32 sid) { struct task_struct *tsk; - int *dest; + struct audit_netlink_list *dest; int err = 0; struct audit_entry *entry; @@ -605,18 +1273,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest[0] = pid; - dest[1] = seq; + dest->pid = pid; + skb_queue_head_init(&dest->q); + mutex_lock(&audit_filter_mutex); if (type == AUDIT_LIST) - tsk = kthread_run(audit_list, dest, "audit_list"); + audit_list(pid, seq, &dest->q); else - tsk = kthread_run(audit_list_rules, dest, - "audit_list_rules"); + audit_list_rules(pid, seq, &dest->q); + 
mutex_unlock(&audit_filter_mutex); + + tsk = kthread_run(audit_send_list, dest, "audit_send_list"); if (IS_ERR(tsk)) { + skb_queue_purge(&dest->q); kfree(dest); err = PTR_ERR(tsk); } @@ -632,6 +1304,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); + if (sid) { char *ctx = NULL; u32 len; @@ -712,7 +1385,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return 0; } +/* Compare given dentry name with last component in given path, + * return of 0 indicates a match. */ +int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen) +{ + int dlen, plen; + const char *p; + if (!dname || !path) + return 1; + + dlen = strlen(dname); + plen = strlen(path); + if (plen < dlen) + return 1; + + /* disregard trailing slashes */ + p = path + plen - 1; + while ((*p == '/') && (p > path)) + p--; + + /* find last path component */ + p = p - dlen + 1; + if (p < path) + return 1; + else if (p > path) { + if (*--p != '/') + return 1; + else + p++; + } + + /* return length of path's directory component */ + if (dirlen) + *dirlen = p - path; + return strncmp(p, dname, dlen); +} static int audit_filter_user_rules(struct netlink_skb_parms *cb, struct audit_krule *rule, @@ -744,7 +1453,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; @@ -826,32 +1534,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) int selinux_audit_rule_update(void) { struct audit_entry *entry, *n, *nentry; + struct audit_watch *watch; int i, err = 0; - /* audit_netlink_mutex synchronizes the writers */ - mutex_lock(&audit_netlink_mutex); + /* audit_filter_mutex synchronizes the writers */ + mutex_lock(&audit_filter_mutex); for (i = 0; i < 
AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { if (!audit_rule_has_selinux(&entry->rule)) continue; - nentry = audit_dupe_rule(&entry->rule); + watch = entry->rule.watch; + nentry = audit_dupe_rule(&entry->rule, watch); if (unlikely(IS_ERR(nentry))) { /* save the first error encountered for the * return value */ if (!err) err = PTR_ERR(nentry); audit_panic("error updating selinux filters"); + if (watch) + list_del(&entry->rule.rlist); list_del_rcu(&entry->list); } else { + if (watch) { + list_add(&nentry->rule.rlist, + &watch->rules); + list_del(&entry->rule.rlist); + } list_replace_rcu(&entry->list, &nentry->list); } call_rcu(&entry->rcu, audit_free_rule_rcu); } } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_filter_mutex); return err; } + +/* Update watch data in audit rules based on inotify events. */ +void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, + u32 cookie, const char *dname, struct inode *inode) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + + if (mask & (IN_CREATE|IN_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, + inode->i_ino, 0); + else if (mask & (IN_DELETE|IN_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + /* inotify automatically removes the watch and sends IN_IGNORED */ + else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) + audit_remove_parent_watches(parent); + /* inotify does not remove the watch, so remove it manually */ + else if(mask & IN_MOVE_SELF) { + audit_remove_parent_watches(parent); + inotify_remove_watch_locked(audit_ih, i_watch); + } else if (mask & IN_IGNORED) + put_inotify_watch(i_watch); +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c03a4ed1b2..dc5e3f01efe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -3,7 +3,7 @@ * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 
* Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright (C) 2005 IBM Corporation + * Copyright (C) 2005, 2006 IBM Corporation * All Rights Reserved. * * This program is free software; you can redistribute it and/or modify @@ -29,6 +29,9 @@ * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * + * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>, + * 2006. + * * The support of additional filter rules compares (>, <, >=, <=) was * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. * @@ -49,6 +52,7 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/socket.h> +#include <linux/mqueue.h> #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> @@ -59,6 +63,8 @@ #include <linux/list.h> #include <linux/tty.h> #include <linux/selinux.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> #include "audit.h" @@ -76,6 +82,9 @@ extern int audit_enabled; * path_lookup. */ #define AUDIT_NAMES_RESERVED 7 +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -83,8 +92,9 @@ extern int audit_enabled; * Further, in fs/namei.c:path_lookup() we store the inode and device. 
*/ struct audit_names { const char *name; + int name_len; /* number of name's characters to log */ + unsigned name_put; /* call __putname() for this name */ unsigned long ino; - unsigned long pino; dev_t dev; umode_t mode; uid_t uid; @@ -100,6 +110,33 @@ struct audit_aux_data { #define AUDIT_AUX_IPCPERM 0 +struct audit_aux_data_mq_open { + struct audit_aux_data d; + int oflag; + mode_t mode; + struct mq_attr attr; +}; + +struct audit_aux_data_mq_sendrecv { + struct audit_aux_data d; + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; +}; + +struct audit_aux_data_mq_notify { + struct audit_aux_data d; + mqd_t mqdes; + struct sigevent notification; +}; + +struct audit_aux_data_mq_getsetattr { + struct audit_aux_data d; + mqd_t mqdes; + struct mq_attr mqstat; +}; + struct audit_aux_data_ipcctl { struct audit_aux_data d; struct ipc_perm p; @@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl { u32 osid; }; +struct audit_aux_data_execve { + struct audit_aux_data d; + int argc; + int envc; + char mem[0]; +}; + struct audit_aux_data_socketcall { struct audit_aux_data d; int nargs; @@ -148,7 +192,7 @@ struct audit_context { struct audit_aux_data *aux; /* Save things to print about task_struct */ - pid_t pid; + pid_t pid, ppid; uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; unsigned long personality; @@ -160,12 +204,13 @@ struct audit_context { #endif }; - +/* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. 
*/ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, + struct audit_names *name, enum audit_state *state) { int i, j, need_sid = 1; @@ -179,6 +224,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PID: result = audit_comparator(tsk->pid, f->op, f->val); break; + case AUDIT_PPID: + if (ctx) + result = audit_comparator(ctx->ppid, f->op, f->val); + break; case AUDIT_UID: result = audit_comparator(tsk->uid, f->op, f->val); break; @@ -224,7 +273,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (ctx) { + if (name) + result = audit_comparator(MAJOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -234,7 +286,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (ctx) { + if (name) + result = audit_comparator(MINOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -244,16 +299,22 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_INODE: - if (ctx) { + if (name) + result = (name->ino == f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val) || - audit_comparator(ctx->names[j].pino, f->op, f->val)) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { ++result; break; } } } break; + case AUDIT_WATCH: + if (name && rule->watch->ino != (unsigned long)-1) + result = (name->dev == rule->watch->dev && + name->ino == rule->watch->ino); + break; case AUDIT_LOGINUID: result = 0; if (ctx) @@ -294,7 +355,6 @@ static int audit_filter_rules(struct task_struct *tsk, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = 
AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; @@ -311,7 +371,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { + if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { rcu_read_unlock(); return state; } @@ -341,8 +401,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, int bit = AUDIT_BIT(ctx->major); list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state)) { + rcu_read_unlock(); + return state; + } + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* At syscall exit time, this filter is called if any audit_names[] have been + * collected during syscall processing. We only check rules in sublists at hash + * buckets applicable to the inode numbers in audit_names[]. + * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
+ */ +enum audit_state audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx) +{ + int i; + struct audit_entry *e; + enum audit_state state; + + if (audit_pid && tsk->tgid == audit_pid) + return AUDIT_DISABLED; + + rcu_read_lock(); + for (i = 0; i < ctx->name_count; i++) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + struct audit_names *n = &ctx->names[i]; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + + if (list_empty(list)) + continue; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { rcu_read_unlock(); return state; } @@ -352,6 +451,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } +void audit_set_auditable(struct audit_context *ctx) +{ + ctx->auditable = 1; +} + static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, int return_code) @@ -365,12 +469,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, if (context->in_syscall && !context->auditable) { enum audit_state state; + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + if (state == AUDIT_RECORD_CONTEXT) { + context->auditable = 1; + goto get_context; + } + + state = audit_filter_inodes(tsk, context); if (state == AUDIT_RECORD_CONTEXT) context->auditable = 1; + } +get_context: context->pid = tsk->pid; + context->ppid = sys_getppid(); /* sic. 
tsk == current in all cases */ context->uid = tsk->uid; context->gid = tsk->gid; context->euid = tsk->euid; @@ -413,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context) #endif for (i = 0; i < context->name_count; i++) { - if (context->names[i].name) + if (context->names[i].name && context->names[i].name_put) __putname(context->names[i].name); } context->name_count = 0; @@ -544,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab) return; error_path: - if (ctx) - kfree(ctx); + kfree(ctx); audit_panic("error in audit_log_task_context"); return; } @@ -606,7 +719,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts tty = "(none)"; audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s", context->argv[0], @@ -614,6 +727,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->argv[2], context->argv[3], context->name_count, + context->ppid, context->pid, context->loginuid, context->uid, @@ -630,11 +744,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { + case AUDIT_MQ_OPEN: { + struct audit_aux_data_mq_open *axi = (void *)aux; + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + axi->oflag, axi->mode, axi->attr.mq_flags, + axi->attr.mq_maxmsg, axi->attr.mq_msgsize, + axi->attr.mq_curmsgs); + break; } + + case AUDIT_MQ_SENDRECV: { + struct audit_aux_data_mq_sendrecv *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + axi->mqdes, axi->msg_len, axi->msg_prio, + axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); + break; } + + case AUDIT_MQ_NOTIFY: { + struct audit_aux_data_mq_notify 
*axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d sigev_signo=%d", + axi->mqdes, + axi->notification.sigev_signo); + break; } + + case AUDIT_MQ_GETSETATTR: { + struct audit_aux_data_mq_getsetattr *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + axi->mqdes, + axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, + axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); + break; } + case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x", - axi->qbytes, axi->uid, axi->gid, axi->mode); + "ouid=%u ogid=%u mode=%x", + axi->uid, axi->gid, axi->mode); if (axi->osid != 0) { char *ctx = NULL; u32 len; @@ -652,19 +803,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", + "qbytes=%lx ouid=%u ogid=%u mode=%x", axi->qbytes, axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (selinux_ctxid_to_string( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else - audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + break; } + + case AUDIT_EXECVE: { + struct audit_aux_data_execve *axi = (void *)aux; + int i; + const char *p; + for (i = 0, p = axi->mem; i < axi->argc; i++) { + audit_log_format(ab, "a%d=", i); + p = audit_log_untrustedstring(ab, p); + audit_log_format(ab, "\n"); } break; } @@ -700,8 +850,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } for (i = 0; i < context->name_count; i++) { - unsigned long ino = context->names[i].ino; - unsigned long pino = context->names[i].pino; + struct audit_names *n = &context->names[i]; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) @@ -709,33 +858,47 @@ static void 
audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, "item=%d", i); - audit_log_format(ab, " name="); - if (context->names[i].name) - audit_log_untrustedstring(ab, context->names[i].name); - else - audit_log_format(ab, "(null)"); - - if (pino != (unsigned long)-1) - audit_log_format(ab, " parent=%lu", pino); - if (ino != (unsigned long)-1) - audit_log_format(ab, " inode=%lu", ino); - if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) - audit_log_format(ab, " dev=%02x:%02x mode=%#o" - " ouid=%u ogid=%u rdev=%02x:%02x", - MAJOR(context->names[i].dev), - MINOR(context->names[i].dev), - context->names[i].mode, - context->names[i].uid, - context->names[i].gid, - MAJOR(context->names[i].rdev), - MINOR(context->names[i].rdev)); - if (context->names[i].osid != 0) { + if (n->name) { + switch(n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", context->pwd, + context->pwdmnt); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name_len, + n->name); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#o" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { char *ctx = NULL; u32 len; if (selinux_ctxid_to_string( - context->names[i].osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - context->names[i].osid); + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); call_panic = 2; } else audit_log_format(ab, " obj=%s", ctx); @@ -908,11 +1071,11 @@ void audit_syscall_exit(int valid, long 
return_code) * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void audit_getname(const char *name) +void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - if (!context || IS_ERR(name) || !name) + if (IS_ERR(name) || !name) return; if (!context->in_syscall) { @@ -925,6 +1088,8 @@ void audit_getname(const char *name) } BUG_ON(context->name_count >= AUDIT_NAMES); context->names[context->name_count].name = name; + context->names[context->name_count].name_len = AUDIT_NAME_FULL; + context->names[context->name_count].name_put = 1; context->names[context->name_count].ino = (unsigned long)-1; ++context->name_count; if (!context->pwd) { @@ -991,11 +1156,10 @@ static void audit_inode_context(int idx, const struct inode *inode) * audit_inode - store the inode and device from a lookup * @name: name being audited * @inode: inode being audited - * @flags: lookup flags (as used in path_lookup()) * * Called from fs/namei.c:path_lookup(). 
*/ -void __audit_inode(const char *name, const struct inode *inode, unsigned flags) +void __audit_inode(const char *name, const struct inode *inode) { int idx; struct audit_context *context = current->audit_context; @@ -1021,20 +1185,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags) ++context->ino_count; #endif } + context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; context->names[idx].mode = inode->i_mode; context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && - (strcmp(name, ".") != 0)) { - context->names[idx].ino = (unsigned long)-1; - context->names[idx].pino = inode->i_ino; - } else { - context->names[idx].ino = inode->i_ino; - context->names[idx].pino = (unsigned long)-1; - } } /** @@ -1056,51 +1213,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode, { int idx; struct audit_context *context = current->audit_context; + const char *found_name = NULL; + int dirlen = 0; if (!context->in_syscall) return; /* determine matching parent */ - if (dname) - for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].pino == pino) { - const char *n; - const char *name = context->names[idx].name; - int dlen = strlen(dname); - int nlen = name ? 
strlen(name) : 0; - - if (nlen < dlen) - continue; - - /* disregard trailing slashes */ - n = name + nlen - 1; - while ((*n == '/') && (n > name)) - n--; - - /* find last path component */ - n = n - dlen + 1; - if (n < name) - continue; - else if (n > name) { - if (*--n != '/') - continue; - else - n++; - } - - if (strncmp(n, dname, dlen) == 0) - goto update_context; + if (!dname) + goto update_context; + for (idx = 0; idx < context->name_count; idx++) + if (context->names[idx].ino == pino) { + const char *name = context->names[idx].name; + + if (!name) + continue; + + if (audit_compare_dname_path(dname, name, &dirlen) == 0) { + context->names[idx].name_len = dirlen; + found_name = name; + break; } + } - /* catch-all in case match not found */ +update_context: idx = context->name_count++; - context->names[idx].name = NULL; - context->names[idx].pino = pino; #if AUDIT_DEBUG context->ino_count++; #endif + /* Re-use the name belonging to the slot for a matching parent directory. + * All names for this context are relinquished in audit_free_names() */ + context->names[idx].name = found_name; + context->names[idx].name_len = AUDIT_NAME_FULL; + context->names[idx].name_put = 0; /* don't call __putname() */ -update_context: if (inode) { context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; @@ -1109,7 +1255,8 @@ update_context: context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - } + } else + context->names[idx].ino = (unsigned long)-1; } /** @@ -1142,18 +1289,23 @@ void auditsc_get_stamp(struct audit_context *ctx, */ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - if (task->audit_context) { - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u", - task->pid, task->uid, - task->audit_context->loginuid, loginuid); - audit_log_end(ab); + struct 
audit_context *context = task->audit_context; + + if (context) { + /* Only log if audit is enabled */ + if (context->in_syscall) { + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%u " + "old auid=%u new auid=%u", + task->pid, task->uid, + context->loginuid, loginuid); + audit_log_end(ab); + } } - task->audit_context->loginuid = loginuid; + context->loginuid = loginuid; } return 0; } @@ -1170,16 +1322,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx) } /** - * audit_ipc_obj - record audit data for ipc object - * @ipcp: ipc permissions + * __audit_mq_open - record audit data for a POSIX MQ open + * @oflag: open flag + * @mode: mode bits + * @u_attr: queue attributes * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_obj(struct kern_ipc_perm *ipcp) +int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) { - struct audit_aux_data_ipcctl *ax; + struct audit_aux_data_mq_open *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_attr != NULL) { + if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->attr, 0, sizeof(ax->attr)); + + ax->oflag = oflag; + ax->mode = mode; + + ax->d.type = AUDIT_MQ_OPEN; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * @mqdes: MQ descriptor + * @msg_len: Message length + * @msg_prio: Message priority + * @u_abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. 
+ */ +int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + ax->msg_prio = msg_prio; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive + * @mqdes: MQ descriptor + * @msg_len: Message length + * @u_msg_prio: Message priority + * @u_abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. 
+ */ +int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, + unsigned int __user *u_msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_msg_prio != NULL) { + if (get_user(ax->msg_prio, u_msg_prio)) { + kfree(ax); + return -EFAULT; + } + } else + ax->msg_prio = 0; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_notify - record audit data for a POSIX MQ notify + * @mqdes: MQ descriptor + * @u_notification: Notification event + * + * Returns 0 for success or NULL context or < 0 on error. + */ + +int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +{ + struct audit_aux_data_mq_notify *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_notification != NULL) { + if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->notification, 0, sizeof(ax->notification)); + + ax->mqdes = mqdes; + + ax->d.type = AUDIT_MQ_NOTIFY; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute + * @mqdes: MQ descriptor + * @mqstat: MQ flags + * + * Returns 0 for success or NULL context or < 0 on error. 
+ */ +int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +{ + struct audit_aux_data_mq_getsetattr *ax; struct audit_context *context = current->audit_context; + if (!audit_enabled) + return 0; + if (likely(!context)) return 0; @@ -1187,6 +1516,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) if (!ax) return -ENOMEM; + ax->mqdes = mqdes; + ax->mqstat = *mqstat; + + ax->d.type = AUDIT_MQ_GETSETATTR; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * audit_ipc_obj - record audit data for ipc object + * @ipcp: ipc permissions + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_ipc_obj(struct kern_ipc_perm *ipcp) +{ + struct audit_aux_data_ipcctl *ax; + struct audit_context *context = current->audit_context; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + ax->uid = ipcp->uid; ax->gid = ipcp->gid; ax->mode = ipcp->mode; @@ -1207,14 +1560,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Returns 0 for success or NULL context or < 0 on error. 
*/ -int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) +int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; @@ -1223,7 +1573,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, ax->uid = uid; ax->gid = gid; ax->mode = mode; - selinux_get_ipc_sid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC_SET_PERM; ax->d.next = context->aux; @@ -1231,6 +1580,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, return 0; } +int audit_bprm(struct linux_binprm *bprm) +{ + struct audit_aux_data_execve *ax; + struct audit_context *context = current->audit_context; + unsigned long p, next; + void *to; + + if (likely(!audit_enabled || !context)) + return 0; + + ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, + GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->argc = bprm->argc; + ax->envc = bprm->envc; + for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { + struct page *page = bprm->page[p / PAGE_SIZE]; + void *kaddr = kmap(page); + next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); + memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); + to += next - p; + kunmap(page); + } + + ax->d.type = AUDIT_EXECVE; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + + /** * audit_socketcall - record audit data for sys_socketcall * @nargs: number of args @@ -1325,19 +1707,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. 
*/ -void audit_signal_info(int sig, struct task_struct *t) +void __audit_signal_info(int sig, struct task_struct *t) { extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; - - if (unlikely(audit_pid && t->tgid == audit_pid)) { - if (sig == SIGTERM || sig == SIGHUP) { - struct audit_context *ctx = current->audit_context; - audit_sig_pid = current->pid; - if (ctx) - audit_sig_uid = ctx->loginuid; - else - audit_sig_uid = current->uid; - } + extern u32 audit_sig_sid; + + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + struct task_struct *tsk = current; + struct audit_context *ctx = tsk->audit_context; + audit_sig_pid = tsk->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = tsk->uid; + selinux_get_task_sid(tsk, &audit_sig_sid); } } diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d..126dee9530a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include <linux/unistd.h> #include <linux/security.h> #include <linux/timex.h> +#include <linux/migrate.h> #include <asm/uaccess.h> @@ -729,17 +730,10 @@ void sigset_from_compat (sigset_t *set, compat_sigset_t *compat) { switch (_NSIG_WORDS) { -#if defined (__COMPAT_ENDIAN_SWAP__) - case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); - case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); - case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); - case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); -#else case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); -#endif } } @@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) return ret; } + +#ifdef CONFIG_NUMA +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long 
nr_pages, + compat_uptr_t __user *pages32, + const int __user *nodes, + int __user *status, + int flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, pages32 + i) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4..70fbf2e8376 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -13,12 +13,12 @@ #include <linux/module.h> #include <linux/kthread.h> #include <linux/stop_machine.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> /* This protects CPUs going up and down... */ -static DECLARE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpucontrol); -static BLOCKING_NOTIFIER_HEAD(cpu_chain); +static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU static struct task_struct *lock_cpu_hotplug_owner; @@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible) if (lock_cpu_hotplug_owner != current) { if (interruptible) - ret = down_interruptible(&cpucontrol); + ret = mutex_lock_interruptible(&cpucontrol); else - down(&cpucontrol); + mutex_lock(&cpucontrol); } /* @@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void) { if (--lock_cpu_hotplug_depth == 0) { lock_cpu_hotplug_owner = NULL; - up(&cpucontrol); + mutex_unlock(&cpucontrol); } } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); @@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? 
*/ -int register_cpu_notifier(struct notifier_block *nb) +int __cpuinit register_cpu_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&cpu_chain, nb); } + +#ifdef CONFIG_HOTPLUG_CPU + EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) @@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); -#ifdef CONFIG_HOTPLUG_CPU static inline void check_for_tasks(int cpu) { struct task_struct *p; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572..1535af3a912 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -41,6 +41,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/security.h> #include <linux/slab.h> #include <linux/smp_lock.h> #include <linux/spinlock.h> @@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, return 0; } -static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super); + return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); } static struct file_system_type cpuset_fs_type = { @@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; + int retval; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + retval = security_task_setscheduler(tsk, 0, NULL); + if (retval) { + put_task_struct(tsk); + return retval; + } + mutex_lock(&callback_mutex); task_lock(tsk); @@ -2434,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void) */ static int 
proc_cpuset_show(struct seq_file *m, void *v) { + struct pid *pid; struct task_struct *tsk; char *buf; - int retval = 0; + int retval; + retval = -ENOMEM; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) - return -ENOMEM; + goto out; - tsk = m->private; + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; + + retval = -EINVAL; mutex_lock(&manage_mutex); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) - goto out; + goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); -out: +out_unlock: mutex_unlock(&manage_mutex); + put_task_struct(tsk); +out_free: kfree(buf); +out: return retval; } static int cpuset_open(struct inode *inode, struct file *file) { - struct task_struct *tsk = PROC_I(inode)->task; - return single_open(file, proc_cpuset_show, tsk); + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); } struct file_operations proc_cpuset_operations = { diff --git a/kernel/exit.c b/kernel/exit.c index e06d0c10a24..ab06b9f88f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -36,6 +36,7 @@ #include <linux/compat.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ +#include <linux/resource.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -45,8 +46,6 @@ extern void sem_exit (void); extern struct task_struct *child_reaper; -int getrusage(struct task_struct *, int, struct rusage __user *); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) @@ -138,12 +137,8 @@ void release_task(struct task_struct * p) { int zap_leader; task_t *leader; - struct dentry *proc_dentry; - repeat: atomic_dec(&p->user->processes); - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); @@ -172,8 +167,7 @@ repeat: sched_exit(p); write_unlock_irq(&tasklist_lock); - 
spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); + proc_flush_task(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); @@ -579,7 +573,7 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - if (mm != tsk->active_mm) BUG(); + BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; @@ -895,11 +889,11 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); - acct_process(code); } + acct_collect(code, group_dead); if (unlikely(tsk->robust_list)) exit_robust_list(tsk); -#ifdef CONFIG_COMPAT +#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif @@ -907,6 +901,8 @@ fastcall NORET_TYPE void do_exit(long code) audit_free(tsk); exit_mm(tsk); + if (group_dead) + acct_process(); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); @@ -930,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code) tsk->mempolicy = NULL; #endif /* + * This must happen late, after the PID is not + * hashed anymore: + */ + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); + if (unlikely(current->pi_state_cache)) + kfree(current->pi_state_cache); + /* * If DEBUG_MUTEXES is on, make sure we are holding no locks: */ mutex_debug_check_no_locks_held(tsk); + rt_mutex_debug_check_no_locks_held(tsk); if (tsk->io_context) exit_io_context(); @@ -1530,8 +1535,7 @@ check_continued: if (options & __WNOTHREAD) break; tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); + BUG_ON(tsk->signal != current->signal); } while (tsk != current); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088..628198a4f28 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; void free_task(struct task_struct *tsk) { 
free_thread_info(tsk->thread_info); + rt_mutex_debug_task_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -368,6 +369,8 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { + might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); @@ -623,6 +626,7 @@ out: /* * Allocate a new files structure and copy contents from the * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { @@ -631,6 +635,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) int open_files, size, i, expand; struct fdtable *old_fdt, *new_fdt; + *errorp = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -744,7 +749,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * break this. */ tsk->files = NULL; - error = -ENOMEM; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -871,6 +875,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts tsk->it_prof_expires = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); } + acct_init_pacct(&sig->pacct); return 0; } @@ -909,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) return current->pid; } +static inline void rt_mutex_init_task(struct task_struct *p) +{ +#ifdef CONFIG_RT_MUTEXES + spin_lock_init(&p->pi_lock); + plist_head_init(&p->pi_waiters, &p->pi_lock); + p->pi_blocked_on = NULL; +# ifdef CONFIG_DEBUG_RT_MUTEXES + spin_lock_init(&p->held_list_lock); + INIT_LIST_HEAD(&p->held_list_head); +# endif +#endif +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. 
@@ -989,13 +1007,10 @@ static task_t *copy_process(unsigned long clone_flags, if (put_user(p->pid, parent_tidptr)) goto bad_fork_cleanup; - p->proc_dentry = NULL; - INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); - spin_lock_init(&p->proc_lock); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); @@ -1033,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags, mpol_fix_fork_child_flag(p); #endif + rt_mutex_init_task(p); + #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif @@ -1075,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags, #ifdef CONFIG_COMPAT p->compat_robust_list = NULL; #endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; + /* * sigaltstack should be cleared when sharing the same VM */ @@ -1155,18 +1175,6 @@ static task_t *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - /* - * Important: if an exit-all has been started then - * do not create this new thread - the whole thread - * group is supposed to exit anyway. - */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) { - spin_unlock(&current->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -EAGAIN; - goto bad_fork_cleanup_namespace; - } - p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057..6c91f938005 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -12,6 +12,10 @@ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
* + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -46,6 +50,8 @@ #include <linux/signal.h> #include <asm/futex.h> +#include "rtmutex_common.h" + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -63,7 +69,7 @@ union futex_key { int offset; } shared; struct { - unsigned long uaddr; + unsigned long address; struct mm_struct *mm; int offset; } private; @@ -75,6 +81,27 @@ union futex_key { }; /* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + atomic_t refcount; + + union futex_key key; +}; + +/* * We use this hashed waitqueue instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared). * @@ -87,15 +114,19 @@ struct futex_q { struct list_head list; wait_queue_head_t waiters; - /* Which hash list lock to use. */ + /* Which hash list lock to use: */ spinlock_t *lock_ptr; - /* Key which the futex is hashed on. */ + /* Key which the futex is hashed on: */ union futex_key key; - /* For fd, sigio sent using these. */ + /* For fd, sigio sent using these: */ int fd; struct file *filp; + + /* Optional priority inheritance state: */ + struct futex_pi_state *pi_state; + struct task_struct *task; }; /* @@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
*/ -static int get_futex_key(unsigned long uaddr, union futex_key *key) +static int get_futex_key(u32 __user *uaddr, union futex_key *key) { + unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct page *page; @@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * The futex address must be "naturally" aligned. */ - key->both.offset = uaddr % PAGE_SIZE; + key->both.offset = address % PAGE_SIZE; if (unlikely((key->both.offset % sizeof(u32)) != 0)) return -EINVAL; - uaddr -= key->both.offset; + address -= key->both.offset; /* * The futex is hashed differently depending on whether * it's in a shared or private mapping. So check vma first. */ - vma = find_extend_vma(mm, uaddr); + vma = find_extend_vma(mm, address); if (unlikely(!vma)) return -EFAULT; @@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) */ if (likely(!(vma->vm_flags & VM_MAYSHARE))) { key->private.mm = mm; - key->private.uaddr = uaddr; + key->private.address = address; return 0; } @@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) key->shared.inode = vma->vm_file->f_dentry->d_inode; key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) + key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff); return 0; } @@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) * from swap. But that's a lot of code to duplicate here * for a rare case, so we simply fetch the page. 
*/ - err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); + err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); if (err >= 0) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key) } } -static inline int get_futex_value_locked(int *dest, int __user *from) +static inline int get_futex_value_locked(u32 *dest, u32 __user *from) { int ret; inc_preempt_count(); - ret = __copy_from_user_inatomic(dest, from, sizeof(int)); + ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); dec_preempt_count(); return ret ? -EFAULT : 0; } /* + * Fault handling. Called with current->mm->mmap_sem held. + */ +static int futex_handle_fault(unsigned long address, int attempt) +{ + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + if (attempt >= 2 || !(vma = find_vma(mm, address)) || + vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) + return -EFAULT; + + switch (handle_mm_fault(mm, vma, address, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + return -EFAULT; + } + return 0; +} + +/* + * PI code: + */ +static int refill_pi_state_cache(void) +{ + struct futex_pi_state *pi_state; + + if (likely(current->pi_state_cache)) + return 0; + + pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); + + if (!pi_state) + return -ENOMEM; + + memset(pi_state, 0, sizeof(*pi_state)); + INIT_LIST_HEAD(&pi_state->list); + /* pi_mutex gets initialized later */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + + current->pi_state_cache = pi_state; + + return 0; +} + +static struct futex_pi_state * alloc_pi_state(void) +{ + struct futex_pi_state *pi_state = current->pi_state_cache; + + WARN_ON(!pi_state); + current->pi_state_cache = NULL; + + return pi_state; +} + +static void free_pi_state(struct futex_pi_state *pi_state) +{ + if (!atomic_dec_and_test(&pi_state->refcount)) + return; 
+ + /* + * If pi_state->owner is NULL, the owner is most probably dying + * and has cleaned up the pi_state already + */ + if (pi_state->owner) { + spin_lock_irq(&pi_state->owner->pi_lock); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); + } + + if (current->pi_state_cache) + kfree(pi_state); + else { + /* + * pi_state->list is already empty. + * clear pi_state->owner. + * refcount is at 0 - put it back to 1. + */ + pi_state->owner = NULL; + atomic_set(&pi_state->refcount, 1); + current->pi_state_cache = pi_state; + } +} + +/* + * Look up the task based on what TID userspace gave us. + * We dont trust it. + */ +static struct task_struct * futex_find_get_task(pid_t pid) +{ + struct task_struct *p; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out_unlock; + if ((current->euid != p->euid) && (current->euid != p->uid)) { + p = NULL; + goto out_unlock; + } + if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + p = NULL; + goto out_unlock; + } + get_task_struct(p); +out_unlock: + read_unlock(&tasklist_lock); + + return p; +} + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) 
+ */ +void exit_pi_state_list(struct task_struct *curr) +{ + struct futex_hash_bucket *hb; + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + union futex_key key; + + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselfs + */ + spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + spin_unlock_irq(&curr->pi_lock); + + hb = hash_futex(&key); + spin_lock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + if (head->next != next) { + spin_unlock(&hb->lock); + continue; + } + + list_del_init(&pi_state->list); + + WARN_ON(pi_state->owner != curr); + + pi_state->owner = NULL; + spin_unlock_irq(&curr->pi_lock); + + rt_mutex_unlock(&pi_state->pi_mutex); + + spin_unlock(&hb->lock); + + spin_lock_irq(&curr->pi_lock); + } + spin_unlock_irq(&curr->pi_lock); +} + +static int +lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) +{ + struct futex_pi_state *pi_state = NULL; + struct futex_q *this, *next; + struct list_head *head; + struct task_struct *p; + pid_t pid; + + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &me->key)) { + /* + * Another waiter already exists - bump up + * the refcount and return its pi_state: + */ + pi_state = this->pi_state; + atomic_inc(&pi_state->refcount); + me->pi_state = pi_state; + + return 0; + } + } + + /* + * We are the first waiter - try to look up the real owner and + * attach the new pi_state to it: + */ + pid = uval & FUTEX_TID_MASK; + p = futex_find_get_task(pid); + if (!p) + return -ESRCH; + + pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make 'p' + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ 
+ pi_state->key = me->key; + + spin_lock_irq(&p->pi_lock); + list_add(&pi_state->list, &p->pi_state_list); + pi_state->owner = p; + spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + me->pi_state = pi_state; + + return 0; +} + +/* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ @@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q) q->lock_ptr = NULL; } +/* + * Hand a PI futex over to the next owner: rewrite the user-space value + * to the new owner's TID, move the pi_state to that task's list and + * release the rt_mutex. + * + * Returns 0 on success, -EFAULT if the user access faulted and -EINVAL + * if there is no pi_state or the futex value changed under us. + */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) +{ + struct task_struct *new_owner; + struct futex_pi_state *pi_state = this->pi_state; + u32 curval, newval; + + if (!pi_state) + return -EINVAL; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + + /* + * This happens when we have stolen the lock and the original + * pending owner did not enqueue itself back on the rt_mutex. + * That's not a tragedy. We know that way, that a lock waiter + * is on the fly. We make the futex_q waiter the pending owner. + */ + if (!new_owner) + new_owner = this->task; + + /* + * We pass it to the next owner. (The WAITERS bit is always + * kept enabled while there is PI state around. We must also + * preserve the owner died bit.) + */ + newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + + /* + * Unlink the pi_state itself (not the owner's list head) from the + * old owner and queue it on the new owner, taking each task's + * pi_lock like the other pi_state_list users do: + */ + spin_lock_irq(&pi_state->owner->pi_lock); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + spin_lock_irq(&new_owner->pi_lock); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + + rt_mutex_unlock(&pi_state->pi_mutex); + + return 0; +} + +static int unlock_futex_pi(u32 __user *uaddr, u32 uval) +{ + u32 oldval; + + /* + * There is no waiter, so we unlock the futex. The owner died + * bit has not to be preserved here.
We are the owner: + */ + inc_preempt_count(); + oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); + dec_preempt_count(); + + if (oldval == -EFAULT) + return oldval; + if (oldval != uval) + return -EAGAIN; + + return 0; +} + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(unsigned long uaddr, int nr_wake) +static int futex_wake(u32 __user *uaddr, int nr_wake) { - union futex_key key; - struct futex_hash_bucket *bh; - struct list_head *head; + struct futex_hash_bucket *hb; struct futex_q *this, *next; + struct list_head *head; + union futex_key key; int ret; down_read(&current->mm->mmap_sem); @@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake) if (unlikely(ret != 0)) goto out; - bh = hash_futex(&key); - spin_lock(&bh->lock); - head = &bh->chain; + hb = hash_futex(&key); + spin_lock(&hb->lock); + head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { + /* + * A PI waiter cannot be woken this way. Leave through + * the common exit so hb->lock and mmap_sem are dropped: + */ + if (this->pi_state) { + ret = -EINVAL; + break; + } wake_futex(this); if (++ret >= nr_wake) break; } } - spin_unlock(&bh->lock); + spin_unlock(&hb->lock); out: up_read(&current->mm->mmap_sem); return ret; @@ -324,10 +648,12 @@ out: * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) +static int +futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, + int nr_wake, int nr_wake2, int op) { union futex_key key1, key2; - struct futex_hash_bucket *bh1, *bh2; + struct futex_hash_bucket *hb1, *hb2; struct list_head *head; struct futex_q *this, *next; int ret, op_ret, attempt = 0; @@ -342,27 +668,29 @@ retryfull: if (unlikely(ret != 0)) goto out; - bh1 = hash_futex(&key1); - bh2 = hash_futex(&key2); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); retry: - if (bh1 < bh2) - spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - if (bh1 > bh2) -
spin_lock(&bh1->lock); + if (hb1 < hb2) + spin_lock(&hb1->lock); + spin_lock(&hb2->lock); + if (hb1 > hb2) + spin_lock(&hb1->lock); - op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); + op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - int dummy; + u32 dummy; - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); #ifndef CONFIG_MMU - /* we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking */ + /* + * we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking + */ ret = op_ret; goto out; #endif @@ -372,47 +700,34 @@ retry: goto out; } - /* futex_atomic_op_inuser needs to both read and write + /* + * futex_atomic_op_inuser needs to both read and write * *(int __user *)uaddr2, but we can't modify it * non-atomically. Therefore, if get_user below is not * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. */ + * still holding the mmap_sem. + */ if (attempt++) { - struct vm_area_struct * vma; - struct mm_struct *mm = current->mm; - - ret = -EFAULT; - if (attempt >= 2 || - !(vma = find_vma(mm, uaddr2)) || - vma->vm_start > uaddr2 || - !(vma->vm_flags & VM_WRITE)) - goto out; - - switch (handle_mm_fault(mm, vma, uaddr2, 1)) { - case VM_FAULT_MINOR: - current->min_flt++; - break; - case VM_FAULT_MAJOR: - current->maj_flt++; - break; - default: + if (futex_handle_fault((unsigned long)uaddr2, + attempt)) { + /* + * ret is still 0 here; report the failed + * fixup instead of returning success: + */ + ret = -EFAULT; goto out; } goto retry; } - /* If we would have faulted, release mmap_sem, - * fault it in and start all over again. */ + /* + * If we would have faulted, release mmap_sem, + * fault it in and start all over again.
+ */ up_read(¤t->mm->mmap_sem); - ret = get_user(dummy, (int __user *)uaddr2); + ret = get_user(dummy, uaddr2); if (ret) return ret; goto retryfull; } - head = &bh1->chain; + head = &hb1->chain; list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { @@ -423,7 +738,7 @@ retry: } if (op_ret > 0) { - head = &bh2->chain; + head = &hb2->chain; op_ret = 0; list_for_each_entry_safe(this, next, head, list) { @@ -436,9 +751,9 @@ retry: ret += op_ret; } - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); out: up_read(¤t->mm->mmap_sem); return ret; @@ -448,11 +763,11 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, - int nr_wake, int nr_requeue, int *valp) +static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, + int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1, key2; - struct futex_hash_bucket *bh1, *bh2; + struct futex_hash_bucket *hb1, *hb2; struct list_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; @@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, if (unlikely(ret != 0)) goto out; - bh1 = hash_futex(&key1); - bh2 = hash_futex(&key2); + hb1 = hash_futex(&key1); + hb2 = hash_futex(&key2); - if (bh1 < bh2) - spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - if (bh1 > bh2) - spin_lock(&bh1->lock); + if (hb1 < hb2) + spin_lock(&hb1->lock); + spin_lock(&hb2->lock); + if (hb1 > hb2) + spin_lock(&hb1->lock); - if (likely(valp != NULL)) { - int curval; + if (likely(cmpval != NULL)) { + u32 curval; - ret = get_futex_value_locked(&curval, (int __user *)uaddr1); + ret = get_futex_value_locked(&curval, uaddr1); if (unlikely(ret)) { - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + 
spin_unlock(&hb2->lock); - /* If we would have faulted, release mmap_sem, fault + /* + * If we would have faulted, release mmap_sem, fault * it in and start all over again. */ up_read(¤t->mm->mmap_sem); - ret = get_user(curval, (int __user *)uaddr1); + ret = get_user(curval, uaddr1); if (!ret) goto retry; return ret; } - if (curval != *valp) { + if (curval != *cmpval) { ret = -EAGAIN; goto out_unlock; } } - head1 = &bh1->chain; + head1 = &hb1->chain; list_for_each_entry_safe(this, next, head1, list) { if (!match_futex (&this->key, &key1)) continue; if (++ret <= nr_wake) { wake_futex(this); } else { - list_move_tail(&this->list, &bh2->chain); - this->lock_ptr = &bh2->lock; + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(head1 != &hb2->chain)) { + list_move_tail(&this->list, &hb2->chain); + this->lock_ptr = &hb2->lock; + } this->key = key2; get_key_refs(&key2); drop_count++; if (ret - nr_wake >= nr_requeue) break; - /* Make sure to stop if key1 == key2 */ - if (head1 == &bh2->chain && head1 != &next->list) - head1 = &this->list; } } out_unlock: - spin_unlock(&bh1->lock); - if (bh1 != bh2) - spin_unlock(&bh2->lock); + spin_unlock(&hb1->lock); + if (hb1 != hb2) + spin_unlock(&hb2->lock); /* drop_key_refs() must be called outside the spinlocks. 
*/ while (--drop_count >= 0) @@ -543,7 +862,7 @@ out: static inline struct futex_hash_bucket * queue_lock(struct futex_q *q, int fd, struct file *filp) { - struct futex_hash_bucket *bh; + struct futex_hash_bucket *hb; q->fd = fd; q->filp = filp; @@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) init_waitqueue_head(&q->waiters); get_key_refs(&q->key); - bh = hash_futex(&q->key); - q->lock_ptr = &bh->lock; + hb = hash_futex(&q->key); + q->lock_ptr = &hb->lock; - spin_lock(&bh->lock); - return bh; + spin_lock(&hb->lock); + return hb; } -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { - list_add_tail(&q->list, &bh->chain); - spin_unlock(&bh->lock); + list_add_tail(&q->list, &hb->chain); + q->task = current; + spin_unlock(&hb->lock); } static inline void -queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) +queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) { - spin_unlock(&bh->lock); + spin_unlock(&hb->lock); drop_key_refs(&q->key); } @@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) /* The key must be already stored in q->key. */ static void queue_me(struct futex_q *q, int fd, struct file *filp) { - struct futex_hash_bucket *bh; - bh = queue_lock(q, fd, filp); - __queue_me(q, bh); + struct futex_hash_bucket *hb; + + hb = queue_lock(q, fd, filp); + __queue_me(q, hb); } /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { - int ret = 0; spinlock_t *lock_ptr; + int ret = 0; /* In the common case we don't take the spinlock, which is nice. 
*/ retry: @@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q) } WARN_ON(list_empty(&q->list)); list_del(&q->list); + + BUG_ON(q->pi_state); + spin_unlock(lock_ptr); ret = 1; } @@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q) return ret; } -static int futex_wait(unsigned long uaddr, int val, unsigned long time) +/* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock is held on entry and dropped here. + */ +static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) { - DECLARE_WAITQUEUE(wait, current); - int ret, curval; + WARN_ON(list_empty(&q->list)); + list_del(&q->list); + + BUG_ON(!q->pi_state); + free_pi_state(q->pi_state); + q->pi_state = NULL; + + spin_unlock(&hb->lock); + + drop_key_refs(&q->key); +} + +static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) +{ + struct task_struct *curr = current; + DECLARE_WAITQUEUE(wait, curr); + struct futex_hash_bucket *hb; struct futex_q q; - struct futex_hash_bucket *bh; + u32 uval; + int ret; + q.pi_state = NULL; retry: - down_read(¤t->mm->mmap_sem); + down_read(&curr->mm->mmap_sem); ret = get_futex_key(uaddr, &q.key); if (unlikely(ret != 0)) goto out_release_sem; - bh = queue_lock(&q, -1, NULL); + hb = queue_lock(&q, -1, NULL); /* * Access the page AFTER the futex is queued. @@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) * We hold the mmap semaphore, so the mapping cannot have changed * since we looked it up in get_futex_key. */ - - ret = get_futex_value_locked(&curval, (int __user *)uaddr); + ret = get_futex_value_locked(&uval, uaddr); if (unlikely(ret)) { - queue_unlock(&q, bh); + queue_unlock(&q, hb); - /* If we would have faulted, release mmap_sem, fault it in and + /* + * If we would have faulted, release mmap_sem, fault it in and * start all over again. 
*/ - up_read(¤t->mm->mmap_sem); + up_read(&curr->mm->mmap_sem); - ret = get_user(curval, (int __user *)uaddr); + ret = get_user(uval, uaddr); if (!ret) goto retry; return ret; } - if (curval != val) { - ret = -EWOULDBLOCK; - queue_unlock(&q, bh); - goto out_release_sem; - } + ret = -EWOULDBLOCK; + if (uval != val) + goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ - __queue_me(&q, bh); + __queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. - */ - up_read(¤t->mm->mmap_sem); + */ + up_read(&curr->mm->mmap_sem); /* * There might have been scheduling since the queue_me(), as we @@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) return 0; if (time == 0) return -ETIMEDOUT; - /* We expect signal_pending(current), but another thread may - * have handled it for us already. */ + /* + * We expect signal_pending(current), but another thread may + * have handled it for us already. + */ return -EINTR; + out_unlock_release_sem: + queue_unlock(&q, hb); + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; +} + +/* + * Userspace tried a 0 -> TID atomic transition of the futex value + * and failed. The kernel side here does the whole locking operation: + * if there are waiters then it will block, it does PI, etc. (Due to + * races the kernel might see a 0 value of the futex too.) 
+ */ +static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, + struct hrtimer_sleeper *to) +{ + struct task_struct *curr = current; + struct futex_hash_bucket *hb; + u32 uval, newval, curval; + struct futex_q q; + int ret, attempt = 0; + + if (refill_pi_state_cache()) + return -ENOMEM; + + q.pi_state = NULL; + retry: + down_read(&curr->mm->mmap_sem); + + ret = get_futex_key(uaddr, &q.key); + if (unlikely(ret != 0)) + goto out_release_sem; + + hb = queue_lock(&q, -1, NULL); + + retry_locked: + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = current->pid; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + + /* We own the lock already */ + if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (!detect && 0) + force_sig(SIGKILL, current); + ret = -EDEADLK; + goto out_unlock_release_sem; + } + + /* + * Surprise - we got the lock. Just return + * to userspace: + */ + if (unlikely(!curval)) + goto out_unlock_release_sem; + + uval = curval; + newval = uval | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, &q); + + if (unlikely(ret)) { + /* + * There were no waiters and the owner task lookup + * failed. When the OWNER_DIED bit is set, then we + * know that this is a robust futex and we actually + * take the lock. This is safe as we are protected by + * the hash bucket lock. 
We also set the waiters bit + * unconditionally here, to simplify glibc handling of + * multiple tasks racing to acquire the lock and + * cleanup the problems which were left by the dead + * owner. + */ + if (curval & FUTEX_OWNER_DIED) { + uval = newval; + newval = current->pid | + FUTEX_OWNER_DIED | FUTEX_WAITERS; + + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + dec_preempt_count(); + + if (unlikely(curval == -EFAULT)) + goto uaddr_faulted; + if (unlikely(curval != uval)) + goto retry_locked; + ret = 0; + } + goto out_unlock_release_sem; + } + + /* + * Only actually queue now that the atomic ops are done: + */ + __queue_me(&q, hb); + + /* + * Now the futex is queued and we have checked the data, we + * don't want to hold mmap_sem while we sleep. + */ + up_read(&curr->mm->mmap_sem); + + WARN_ON(!q.pi_state); + /* + * Block on the PI mutex: + */ + if (!trylock) + ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); + else { + ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + /* Fixup the trylock return value: */ + ret = ret ? 0 : -EWOULDBLOCK; + } + + down_read(&curr->mm->mmap_sem); + hb = queue_lock(&q, -1, NULL); + + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (!ret && q.pi_state->owner != curr) { + u32 newtid = current->pid | FUTEX_WAITERS; + + /* Owner died? */ + if (q.pi_state->owner != NULL) { + spin_lock_irq(&q.pi_state->owner->pi_lock); + list_del_init(&q.pi_state->list); + spin_unlock_irq(&q.pi_state->owner->pi_lock); + } else + newtid |= FUTEX_OWNER_DIED; + + q.pi_state->owner = current; + + spin_lock_irq(¤t->pi_lock); + list_add(&q.pi_state->list, ¤t->pi_state_list); + spin_unlock_irq(¤t->pi_lock); + + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + /* + * We own it, so we have to replace the pending owner + * TID. 
This must be atomic as we have preserve the + * owner died bit here. + */ + ret = get_user(uval, uaddr); + while (!ret) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + curval = futex_atomic_cmpxchg_inatomic(uaddr, + uval, newval); + if (curval == -EFAULT) + ret = -EFAULT; + if (curval == uval) + break; + uval = curval; + } + } else { + /* + * Catch the rare case, where the lock was released + * when we were on the way back before we locked + * the hash bucket. + */ + if (ret && q.pi_state->owner == curr) { + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + } + /* Unqueue and drop the lock */ + unqueue_me_pi(&q, hb); + up_read(&curr->mm->mmap_sem); + } + + if (!detect && ret == -EDEADLK && 0) + force_sig(SIGKILL, current); + + return ret; + + out_unlock_release_sem: + queue_unlock(&q, hb); + + out_release_sem: + up_read(&curr->mm->mmap_sem); + return ret; + + uaddr_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. 
+ */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) { + /* ret may be stale (0 or a lookup error) here */ + ret = -EFAULT; + goto out_unlock_release_sem; + } + + goto retry_locked; + } + + queue_unlock(&q, hb); + up_read(&curr->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + + return ret; +} + +/* + * Restart handler + * + * Re-runs do_futex_lock_pi() with the arguments that futex_lock_pi() + * saved in the restart block: arg0 = uaddr, arg1 = detect, and + * arg2/arg3 = low/high 32 bits of the absolute expiry (both 0 when + * there is no timeout). + */ +static long futex_lock_pi_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper timeout, *to = NULL; + int ret; + + restart->fn = do_no_restart_syscall; + + if (restart->arg2 || restart->arg3) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + /* Reassemble the expiry from its saved halves: */ + to->timer.expires.tv64 = ((u64)restart->arg3 << 32) | + (u64) restart->arg2; + } + + pr_debug("lock_pi restart: %p, %d (%d)\n", + (u32 __user *)restart->arg0, (int)restart->arg1, + current->pid); + + ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, + 0, to); + + if (ret != -EINTR) + return ret; + + restart->fn = futex_lock_pi_restart; + + /* The other values are filled in */ + return -ERESTART_RESTARTBLOCK; +} + +/* + * Called from the syscall entry below.
+ */ +static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, + long nsec, int trylock) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + int ret; + + if (sec != MAX_SCHEDULE_TIMEOUT) { + to = &timeout; + hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); + hrtimer_init_sleeper(to, current); + to->timer.expires = ktime_set(sec, nsec); + } + + ret = do_futex_lock_pi(uaddr, detect, trylock, to); + + if (ret != -EINTR) + return ret; + + pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, detect, + current->pid); + + /* + * Save the arguments for futex_lock_pi_restart(). The 64-bit + * expiry is split into two halves: arg2 = low 32 bits, + * arg3 = high 32 bits; both are 0 when there is no timeout. + */ + restart = &current_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->arg0 = (unsigned long) uaddr; + restart->arg1 = detect; + if (to) { + restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; + restart->arg3 = to->timer.expires.tv64 >> 32; + } else + restart->arg2 = restart->arg3 = 0; + + return -ERESTART_RESTARTBLOCK; +} + +/* + * Userspace attempted a TID -> 0 atomic transition, and failed. + * This is the in-kernel slowpath: we look up the PI state (if any), + * and do the rt-mutex unlock. + */ +static int futex_unlock_pi(u32 __user *uaddr) +{ + struct futex_hash_bucket *hb; + struct futex_q *this, *next; + u32 uval; + struct list_head *head; + union futex_key key; + int ret, attempt = 0; + +retry: + if (get_user(uval, uaddr)) + return -EFAULT; + /* + * We release only a lock we actually own: + */ + if ((uval & FUTEX_TID_MASK) != current->pid) + return -EPERM; + /* + * First take all the futex related locks: + */ + down_read(&current->mm->mmap_sem); + + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out; + + hb = hash_futex(&key); + spin_lock(&hb->lock); + +retry_locked: + /* + * To avoid races, try to do the TID -> 0 atomic transition + * again.
If it succeeds then we can return without waking + * anyone else up: + */ + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + + if (unlikely(uval == -EFAULT)) + goto pi_faulted; + /* + * Rare case: we managed to release the lock atomically, + * no need to wake anyone else up: + */ + if (unlikely(uval == current->pid)) + goto out_unlock; + + /* + * Ok, other tasks may need to be woken up - check waiters + * and do the wakeup if necessary: + */ + head = &hb->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (!match_futex (&this->key, &key)) + continue; + ret = wake_futex_pi(uaddr, uval, this); + /* + * The atomic access to the futex value + * generated a pagefault, so retry the + * user-access and the wakeup: + */ + if (ret == -EFAULT) + goto pi_faulted; + goto out_unlock; + } + /* + * No waiters - kernel unlocks the futex: + */ + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + +out_unlock: + spin_unlock(&hb->lock); +out: up_read(&current->mm->mmap_sem); + + return ret; + +pi_faulted: + /* + * We have to r/w *(int __user *)uaddr, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. + */ + if (attempt++) { + if (futex_handle_fault((unsigned long)uaddr, attempt)) { + /* + * ret can still be 0 when the cmpxchg above + * faulted; do not report success: + */ + ret = -EFAULT; + goto out_unlock; + } + + goto retry_locked; + } + + spin_unlock(&hb->lock); + up_read(&current->mm->mmap_sem); + + ret = get_user(uval, uaddr); + if (!ret && (uval != -EFAULT)) + goto retry; + return ret; } @@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp) unqueue_me(q); kfree(q); + return 0; } @@ -766,7 +1519,7 @@ static struct file_operations futex_fops = { * Signal allows caller to avoid the race which would occur if they * set the sigio stuff up afterwards.
*/ -static int futex_fd(unsigned long uaddr, int signal) +static int futex_fd(u32 __user *uaddr, int signal) { struct futex_q *q; struct file *filp; @@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal) err = -ENOMEM; goto error; } + q->pi_state = NULL; down_read(¤t->mm->mmap_sem); err = get_futex_key(uaddr, &q->key); @@ -840,7 +1594,7 @@ error: * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the - * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after @@ -915,7 +1669,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) { - u32 uval; + u32 uval, nval; retry: if (get_user(uval, uaddr)) @@ -932,12 +1686,16 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. 
*/ - if (futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED) != uval) + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, + uval | FUTEX_OWNER_DIED); + if (nval == -EFAULT) + return -1; + + if (nval != uval) goto retry; if (uval & FUTEX_WAITERS) - futex_wake((unsigned long)uaddr, 1); + futex_wake(uaddr, 1); } return 0; } @@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr) while (entry != &head->list) { /* * A pending lock might already be on the list, so - * dont process it twice: + * don't process it twice: */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, @@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr) } } -long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, - unsigned long uaddr2, int val2, int val3) +long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, + u32 __user *uaddr2, u32 val2, u32 val3) { int ret; @@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); break; + case FUTEX_LOCK_PI: + ret = futex_lock_pi(uaddr, val, timeout, val2, 0); + break; + case FUTEX_UNLOCK_PI: + ret = futex_unlock_pi(uaddr); + break; + case FUTEX_TRYLOCK_PI: + ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); + break; default: ret = -ENOSYS; } @@ -1031,36 +1798,40 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, } -asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, +asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, struct timespec __user *utime, u32 __user *uaddr2, - int val3) + u32 val3) { struct timespec t; unsigned long timeout = MAX_SCHEDULE_TIMEOUT; - int val2 = 0; + u32 val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (copy_from_user(&t, utime, sizeof(t)) != 0) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = 
timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } /* * requeue parameter in 'utime' if op == FUTEX_REQUEUE. */ - if (op >= FUTEX_REQUEUE) - val2 = (int) (unsigned long) utime; + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) + val2 = (u32) (unsigned long) utime; - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); + return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); } -static struct super_block * -futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); } static struct file_system_type futex_fs_type = { diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d1..d1d92b441fb 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout = MAX_SCHEDULE_TIMEOUT; int val2 = 0; - if (utime && (op == FUTEX_WAIT)) { + if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { if (get_compat_timespec(&t, utime)) return -EFAULT; if (!timespec_valid(&t)) return -EINVAL; - timeout = timespec_to_jiffies(&t) + 1; + if (op == FUTEX_WAIT) + timeout = timespec_to_jiffies(&t) + 1; + else { + timeout = t.tv_sec; + val2 = t.tv_nsec; + } } - if (op >= FUTEX_REQUEUE) + if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) val2 = (int) (unsigned long) utime; - return do_futex((unsigned long)uaddr, op, val, timeout, - (unsigned long)uaddr2, val2, val3); + return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 01fa2ae98a8..8d3dc29ef41 100644 --- 
a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = /** * ktime_get_ts - get the monotonic clock in timespec format - * * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime @@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) # ifndef CONFIG_KTIME_SCALAR /** * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * * @kt: addend * @nsec: the scalar nsec value to add * @@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /** * hrtimer_forward - forward the timer expiry - * * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward @@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) if (base->first == &timer->node) base->first = rb_next(&timer->node); rb_erase(&timer->node, &base->active); - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } /* @@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) /** * hrtimer_start - (re)start an relative timer on the current CPU - * * @timer: the timer to be added * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) @@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); /** * hrtimer_try_to_cancel - try to deactivate a timer - * * @timer: hrtimer to stop * * Returns: * 0 when the timer was not active * 1 when the timer was active * -1 when the timer is currently excuting the callback function and - * can not be stopped + * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { @@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. 
- * * @timer: the timer to be cancelled * * Returns: @@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer - * * @timer: the timer to read */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) @@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) /** * hrtimer_init - initialize a timer to the given clock - * * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel @@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; timer->base = &bases[clock_id]; - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_get_res - get the timer resolution for a clock - * * @which_clock: which clock to query * @tp: pointer to timespec variable to store the resolution * @@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_base *bases; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); *tp = ktime_to_timespec(bases[which_clock].resolution); return 0; @@ -842,7 +833,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int hrtimer_cpu_notify(struct notifier_block *self, +static int __devinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -866,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block hrtimers_nb = { +static struct notifier_block __devinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/intermodule.c b/kernel/intermodule.c deleted file 
mode 100644 index 55b1e5b85db..00000000000 --- a/kernel/intermodule.c +++ /dev/null @@ -1,184 +0,0 @@ -/* Deprecated, do not use. Moved from module.c to here. --RR */ - -/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */ -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/spinlock.h> -#include <linux/list.h> -#include <linux/slab.h> - -/* inter_module functions are always available, even when the kernel is - * compiled without modules. Consumers of inter_module_xxx routines - * will always work, even when both are built into the kernel, this - * approach removes lots of #ifdefs in mainline code. - */ - -static struct list_head ime_list = LIST_HEAD_INIT(ime_list); -static DEFINE_SPINLOCK(ime_lock); -static int kmalloc_failed; - -struct inter_module_entry { - struct list_head list; - const char *im_name; - struct module *owner; - const void *userdata; -}; - -/** - * inter_module_register - register a new set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * @owner: module that is registering the data, always use THIS_MODULE - * @userdata: pointer to arbitrary userdata to be registered - * - * Description: Check that the im_name has not already been registered, - * complain if it has. For new data, add it to the inter_module_entry - * list. 
- */ -void inter_module_register(const char *im_name, struct module *owner, const void *userdata) -{ - struct list_head *tmp; - struct inter_module_entry *ime, *ime_new; - - if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { - /* Overloaded kernel, not fatal */ - printk(KERN_ERR - "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", - im_name); - kmalloc_failed = 1; - return; - } - ime_new->im_name = im_name; - ime_new->owner = owner; - ime_new->userdata = userdata; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - spin_unlock(&ime_lock); - kfree(ime_new); - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); - BUG(); - } - } - list_add(&(ime_new->list), &ime_list); - spin_unlock(&ime_lock); -} - -/** - * inter_module_unregister - unregister a set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: Check that the im_name has been registered, complain if - * it has not. For existing data, remove it from the - * inter_module_entry list. - */ -void inter_module_unregister(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - list_del(&(ime->list)); - spin_unlock(&ime_lock); - kfree(ime); - return; - } - } - spin_unlock(&ime_lock); - if (kmalloc_failed) { - printk(KERN_ERR - "inter_module_unregister: no entry for '%s', " - "probably caused by previous kmalloc failure\n", - im_name); - return; - } - else { - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); - BUG(); - } -} - -/** - * inter_module_get - return arbitrary userdata from another module. 
- * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, return NULL. - * Try to increment the use count on the owning module, if that fails - * then return NULL. Otherwise return the userdata. - */ -static const void *inter_module_get(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - const void *result = NULL; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (try_module_get(ime->owner)) - result = ime->userdata; - break; - } - } - spin_unlock(&ime_lock); - return(result); -} - -/** - * inter_module_get_request - im get with automatic request_module. - * @im_name: an arbitrary string to identify the data, must be unique - * @modname: module that is expected to register im_name - * - * Description: If inter_module_get fails, do request_module then retry. - */ -const void *inter_module_get_request(const char *im_name, const char *modname) -{ - const void *result = inter_module_get(im_name); - if (!result) { - request_module("%s", modname); - result = inter_module_get(im_name); - } - return(result); -} - -/** - * inter_module_put - release use of data from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, complain, - * otherwise decrement the use count on the owning module. 
- */ -void inter_module_put(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (ime->owner) - module_put(ime->owner); - spin_unlock(&ime_lock); - return; - } - } - spin_unlock(&ime_lock); - printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); - BUG(); -} - -EXPORT_SYMBOL(inter_module_register); -EXPORT_SYMBOL(inter_module_unregister); -EXPORT_SYMBOL(inter_module_get_request); -EXPORT_SYMBOL(inter_module_put); - -MODULE_LICENSE("GPL"); - diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 9f77f50d814..1dab0ac3f79 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,5 @@ -obj-y := handle.o manage.o spurious.o +obj-y := handle.o manage.o spurious.o resend.o chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 3467097ca61..533068cfb60 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -11,12 +11,14 @@ #include <linux/interrupt.h> #include <linux/delay.h> +#include "internals.h" + /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with * "IRQ_WAITING" cleared and the interrupt disabled. */ -static DECLARE_MUTEX(probe_sem); +static DEFINE_MUTEX(probing_active); /** * probe_irq_on - begin an interrupt autodetect @@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem); */ unsigned long probe_irq_on(void) { - unsigned long val; - irq_desc_t *desc; + struct irq_desc *desc; + unsigned long mask; unsigned int i; - down(&probe_sem); + mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to * flush such a longstanding irq before considering it as spurious. 
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void) desc = irq_desc + i; spin_lock_irq(&desc->lock); - if (!irq_desc[i].action) - irq_desc[i].handler->startup(i); + if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + /* + * An old-style architecture might still have + * the handle_bad_irq handler there: + */ + compat_irq_chip_set_default_handler(desc); + + /* + * Some chips need to know about probing in + * progress: + */ + if (desc->chip->set_type) + desc->chip->set_type(i, IRQ_TYPE_PROBE); + desc->chip->startup(i); + } spin_unlock_irq(&desc->lock); } @@ -57,9 +72,9 @@ unsigned long probe_irq_on(void) desc = irq_desc + i; spin_lock_irq(&desc->lock); - if (!desc->action) { + if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->handler->startup(i)) + if (desc->chip->startup(i)) desc->status |= IRQ_PENDING; } spin_unlock_irq(&desc->lock); @@ -73,11 +88,11 @@ unsigned long probe_irq_on(void) /* * Now filter out any obviously spurious interrupts */ - val = 0; + mask = 0; for (i = 0; i < NR_IRQS; i++) { - irq_desc_t *desc = irq_desc + i; unsigned int status; + desc = irq_desc + i; spin_lock_irq(&desc->lock); status = desc->status; @@ -85,17 +100,16 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); + desc->chip->shutdown(i); } else if (i < 32) - val |= 1 << i; + mask |= 1 << i; } spin_unlock_irq(&desc->lock); } - return val; + return mask; } - EXPORT_SYMBOL(probe_irq_on); /** @@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val) mask = 0; for (i = 0; i < NR_IRQS; i++) { - irq_desc_t *desc = irq_desc + i; + struct irq_desc *desc = irq_desc + i; unsigned int status; spin_lock_irq(&desc->lock); @@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); + desc->chip->shutdown(i); } spin_unlock_irq(&desc->lock); } - up(&probe_sem); + mutex_unlock(&probing_active); return mask & val; } @@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val) int i, irq_found = 0, nr_irqs = 0; for (i = 0; i < NR_IRQS; i++) { - irq_desc_t *desc = irq_desc + i; + struct irq_desc *desc = irq_desc + i; unsigned int status; spin_lock_irq(&desc->lock); @@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val) nr_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); + desc->chip->shutdown(i); } spin_unlock_irq(&desc->lock); } - up(&probe_sem); + mutex_unlock(&probing_active); if (nr_irqs > 1) irq_found = -irq_found; + return irq_found; } - EXPORT_SYMBOL(probe_irq_off); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c new file mode 100644 index 00000000000..4a0952d9458 --- /dev/null +++ b/kernel/irq/chip.c @@ -0,0 +1,525 @@ +/* + * linux/kernel/irq/chip.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code, for irq-chip + * based architectures. 
+ * + * Detailed information is available in Documentation/DocBook/genericirq + */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> + +#include "internals.h" + +/** + * set_irq_chip - set the irq chip for an irq + * @irq: irq number + * @chip: pointer to irq chip description structure + */ +int set_irq_chip(unsigned int irq, struct irq_chip *chip) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); + WARN_ON(1); + return -EINVAL; + } + + if (!chip) + chip = &no_irq_chip; + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + irq_chip_set_defaults(chip); + desc->chip = chip; + /* + * For compatibility only: + */ + desc->chip = chip; + spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL(set_irq_chip); + +/** + * set_irq_type - set the irq type for an irq + * @irq: irq number + * @type: interrupt type - see include/linux/interrupt.h + */ +int set_irq_type(unsigned int irq, unsigned int type) +{ + struct irq_desc *desc; + unsigned long flags; + int ret = -ENXIO; + + if (irq >= NR_IRQS) { + printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); + return -ENODEV; + } + + desc = irq_desc + irq; + if (desc->chip->set_type) { + spin_lock_irqsave(&desc->lock, flags); + ret = desc->chip->set_type(irq, type); + spin_unlock_irqrestore(&desc->lock, flags); + } + return ret; +} +EXPORT_SYMBOL(set_irq_type); + +/** + * set_irq_data - set irq type data for an irq + * @irq: Interrupt number + * @data: Pointer to interrupt specific data + * + * Set the hardware irq controller data for an irq + */ +int set_irq_data(unsigned int irq, void *data) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR + "Trying to install controller data for IRQ%d\n", irq); + return -EINVAL; + } + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); + 
desc->handler_data = data; + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +EXPORT_SYMBOL(set_irq_data); + +/** + * set_irq_chip_data - set irq chip data for an irq + * @irq: Interrupt number + * @data: Pointer to chip specific data + * + * Set the hardware irq chip data for an irq + */ +int set_irq_chip_data(unsigned int irq, void *data) +{ + struct irq_desc *desc = irq_desc + irq; + unsigned long flags; + + if (irq >= NR_IRQS || !desc->chip) { + printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); + return -EINVAL; + } + + spin_lock_irqsave(&desc->lock, flags); + desc->chip_data = data; + spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL(set_irq_chip_data); + +/* + * default enable function + */ +static void default_enable(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; +} + +/* + * default disable function + */ +static void default_disable(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + if (!(desc->status & IRQ_DELAYED_DISABLE)) + irq_desc[irq].chip->mask(irq); +} + +/* + * default startup function + */ +static unsigned int default_startup(unsigned int irq) +{ + irq_desc[irq].chip->enable(irq); + + return 0; +} + +/* + * Fixup enable/disable function pointers + */ +void irq_chip_set_defaults(struct irq_chip *chip) +{ + if (!chip->enable) + chip->enable = default_enable; + if (!chip->disable) + chip->disable = default_disable; + if (!chip->startup) + chip->startup = default_startup; + if (!chip->shutdown) + chip->shutdown = chip->disable; + if (!chip->name) + chip->name = chip->typename; +} + +static inline void mask_ack_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->mask_ack) + desc->chip->mask_ack(irq); + else { + desc->chip->mask(irq); + desc->chip->ack(irq); + } +} + +/** + * handle_simple_irq - Simple and software-decoded IRQs. 
+ * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Simple interrupts are either sent from a demultiplexing interrupt + * handler or come from hardware, where no interrupt hardware control + * is necessary. + * + * Note: The caller is expected to handle the ack, clear, mask and + * unmask issues if necessary. + */ +void fastcall +handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + struct irqaction *action; + irqreturn_t action_ret; + const unsigned int cpu = smp_processor_id(); + + spin_lock(&desc->lock); + + if (unlikely(desc->status & IRQ_INPROGRESS)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_cpu(cpu).irqs[irq]++; + + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out_unlock; + + desc->status |= IRQ_INPROGRESS; + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; +out_unlock: + spin_unlock(&desc->lock); +} + +/** + * handle_level_irq - Level type irq handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Level type interrupts are active as long as the hardware line has + * the active level. This may require to mask the interrupt and unmask + * it after the associated handler has acknowledged the device, so the + * interrupt line is back to inactive. 
+ */ +void fastcall +handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + unsigned int cpu = smp_processor_id(); + struct irqaction *action; + irqreturn_t action_ret; + + spin_lock(&desc->lock); + mask_ack_irq(desc, irq); + + if (unlikely(desc->status & IRQ_INPROGRESS)) + goto out; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_cpu(cpu).irqs[irq]++; + + /* + * If its disabled or no action available + * keep it masked and get out of here + */ + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) + goto out; + + desc->status |= IRQ_INPROGRESS; + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; +out: + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); + spin_unlock(&desc->lock); +} + +/** + * handle_fasteoi_irq - irq handler for transparent controllers + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Only a single callback will be issued to the chip: an ->eoi() + * call when the interrupt has been serviced. This enables support + * for modern forms of interrupt handlers, which handle the flow + * details in hardware, transparently. 
+ */ +void fastcall +handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc, + struct pt_regs *regs) +{ + unsigned int cpu = smp_processor_id(); + struct irqaction *action; + irqreturn_t action_ret; + + spin_lock(&desc->lock); + + if (unlikely(desc->status & IRQ_INPROGRESS)) + goto out; + + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + kstat_cpu(cpu).irqs[irq]++; + + /* + * If it's disabled or no action available + * keep it masked and get out of here + */ + action = desc->action; + if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + desc->status |= IRQ_PENDING; + goto out; + } + + desc->status |= IRQ_INPROGRESS; + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; +out: + desc->chip->eoi(irq); + + spin_unlock(&desc->lock); +} + +/** + * handle_edge_irq - edge type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Interrupt occurs on the falling and/or rising edge of a hardware + * signal. The occurrence is latched into the irq controller hardware + * and must be acked in order to be reenabled. After the ack another + * interrupt can happen on the same source even before the first one + * is handled by the associated event handler. If this happens it + * might be necessary to disable (mask) the interrupt depending on the + * controller hardware. This requires to reenable the interrupt inside + * of the loop which handles the interrupts which have arrived while + * the handler was running. If all pending interrupts are handled, the + * loop is left.
+ */ +void fastcall +handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + const unsigned int cpu = smp_processor_id(); + + spin_lock(&desc->lock); + + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + + /* + * If we're currently running this IRQ, or it's disabled, + * we shouldn't process the IRQ. Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || + !desc->action)) { + desc->status |= (IRQ_PENDING | IRQ_MASKED); + mask_ack_irq(desc, irq); + goto out_unlock; + } + + kstat_cpu(cpu).irqs[irq]++; + + /* Start handling the irq */ + desc->chip->ack(irq); + + /* Mark the IRQ currently in progress.*/ + desc->status |= IRQ_INPROGRESS; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->chip->mask(irq); + goto out_unlock; + } + + /* + * When another irq arrived while we were handling + * one, we could have masked the irq. + * Re-enable it, if it was not disabled in meantime.
+ */ + if (unlikely((desc->status & + (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == + (IRQ_PENDING | IRQ_MASKED))) { + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; + } + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, regs, action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + spin_lock(&desc->lock); + + } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + + desc->status &= ~IRQ_INPROGRESS; +out_unlock: + spin_unlock(&desc->lock); +} + +#ifdef CONFIG_SMP +/** + * handle_percpu_irq - Per CPU local irq handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * @regs: pointer to a register structure + * + * Per CPU interrupts on SMP machines without locking requirements + */ +void fastcall +handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + irqreturn_t action_ret; + + kstat_this_cpu.irqs[irq]++; + + if (desc->chip->ack) + desc->chip->ack(irq); + + action_ret = handle_IRQ_event(irq, regs, desc->action); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret, regs); + + if (desc->chip->eoi) + desc->chip->eoi(irq); +} + +#endif /* CONFIG_SMP */ + +void +__set_irq_handler(unsigned int irq, + void fastcall (*handle)(unsigned int, irq_desc_t *, + struct pt_regs *), + int is_chained) +{ + struct irq_desc *desc; + unsigned long flags; + + if (irq >= NR_IRQS) { + printk(KERN_ERR + "Trying to install type control for IRQ%d\n", irq); + return; + } + + desc = irq_desc + irq; + + if (!handle) + handle = handle_bad_irq; + + if (is_chained && desc->chip == &no_irq_chip) + printk(KERN_WARNING "Trying to install " + "chained interrupt type for IRQ%d\n", irq); + + spin_lock_irqsave(&desc->lock, flags); + + /* Uninstall?
*/ + if (handle == handle_bad_irq) { + if (desc->chip != &no_irq_chip) { + desc->chip->mask(irq); + desc->chip->ack(irq); + } + desc->status |= IRQ_DISABLED; + desc->depth = 1; + } + desc->handle_irq = handle; + + if (handle != handle_bad_irq && is_chained) { + desc->status &= ~IRQ_DISABLED; + desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + desc->depth = 0; + desc->chip->unmask(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +void +set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, + void fastcall (*handle)(unsigned int, + struct irq_desc *, + struct pt_regs *)) +{ + set_irq_chip(irq, chip); + __set_irq_handler(irq, handle, 0); +} + +/* + * Get a descriptive string for the highlevel handler, for + * /proc/interrupts output: + */ +const char * +handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *, + struct pt_regs *)) +{ + if (handle == handle_level_irq) + return "level "; + if (handle == handle_fasteoi_irq) + return "fasteoi"; + if (handle == handle_edge_irq) + return "edge "; + if (handle == handle_simple_irq) + return "simple "; +#ifdef CONFIG_SMP + if (handle == handle_percpu_irq) + return "percpu "; +#endif + if (handle == handle_bad_irq) + return "bad "; + + return NULL; +} diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37d..5a360dd4331 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -1,9 +1,13 @@ /* * linux/kernel/irq/handle.c * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the core interrupt handling code. 
+ * + * Detailed information is available in Documentation/DocBook/genericirq + * */ #include <linux/irq.h> @@ -14,11 +18,22 @@ #include "internals.h" +/** + * handle_bad_irq - handle spurious and unhandled irqs + */ +void fastcall +handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs) +{ + print_irq_desc(irq, desc); + kstat_this_cpu.irqs[irq]++; + ack_bad_irq(irq); +} + /* * Linux has a controller-independent interrupt architecture. * Every controller has a 'controller-template', that is used * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the apropriate + * interrupt source is transparently wired to the appropriate * controller. Thus drivers need not be aware of the * interrupt-controller. * @@ -28,41 +43,52 @@ * * Controller mappings for all interrupt sources: */ -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, - .handler = &no_irq_type, - .lock = SPIN_LOCK_UNLOCKED + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = SPIN_LOCK_UNLOCKED, +#ifdef CONFIG_SMP + .affinity = CPU_MASK_ALL +#endif } }; /* - * Generic 'no controller' code + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this for itself. */ -static void end_none(unsigned int irq) { } -static void enable_none(unsigned int irq) { } -static void disable_none(unsigned int irq) { } -static void shutdown_none(unsigned int irq) { } -static unsigned int startup_none(unsigned int irq) { return 0; } - -static void ack_none(unsigned int irq) +static void ack_bad(unsigned int irq) { - /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themself.
- */ + print_irq_desc(irq, irq_desc + irq); ack_bad_irq(irq); } -struct hw_interrupt_type no_irq_type = { - .typename = "none", - .startup = startup_none, - .shutdown = shutdown_none, - .enable = enable_none, - .disable = disable_none, - .ack = ack_none, - .end = end_none, - .set_affinity = NULL +/* + * NOP functions + */ +static void noop(unsigned int irq) +{ +} + +static unsigned int noop_ret(unsigned int irq) +{ + return 0; +} + +/* + * Generic no controller implementation + */ +struct irq_chip no_irq_chip = { + .name = "none", + .startup = noop_ret, + .shutdown = noop, + .enable = noop, + .disable = noop, + .ack = ack_bad, + .end = noop, }; /* @@ -73,13 +99,19 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) return IRQ_NONE; } -/* - * Have got an event to handle: +/** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number + * @regs: pointer to a register structure + * @action: the interrupt action chain for this irq + * + * Handles the action chain of an irq event */ -fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, - struct irqaction *action) +irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, + struct irqaction *action) { - int ret, retval = 0, status = 0; + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); @@ -99,15 +131,22 @@ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, return retval; } -/* - * do_IRQ handles all normal device IRQ's (the special +/** + * __do_IRQ - original all in one highlevel IRQ handler + * @irq: the interrupt number + * @regs: pointer to a register structure + * + * __do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific * handlers). + * + * This is the original x86 implementation which is used for every + * interrupt type. 
*/ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) { - irq_desc_t *desc = irq_desc + irq; - struct irqaction * action; + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; unsigned int status; kstat_this_cpu.irqs[irq]++; @@ -117,16 +156,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) /* * No locking required for CPU-local interrupts: */ - if (desc->handler->ack) - desc->handler->ack(irq); + if (desc->chip->ack) + desc->chip->ack(irq); action_ret = handle_IRQ_event(irq, regs, desc->action); - desc->handler->end(irq); + desc->chip->end(irq); return 1; } spin_lock(&desc->lock); - if (desc->handler->ack) - desc->handler->ack(irq); + if (desc->chip->ack) + desc->chip->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested @@ -186,7 +225,7 @@ out: * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. 
*/ - desc->handler->end(irq); + desc->chip->end(irq); spin_unlock(&desc->lock); return 1; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46feba63026..08a849a2244 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -4,6 +4,12 @@ extern int noirqdebug; +/* Set default functions for irq_chip structures: */ +extern void irq_chip_set_defaults(struct irq_chip *chip); + +/* Set default handler: */ +extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq); extern void register_handler_proc(unsigned int irq, struct irqaction *action); @@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +/* + * Debugging printout: + */ + +#include <linux/kallsyms.h> + +#define P(f) if (desc->status & f) printk("%14s set\n", #f) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->chip(): %p, ", desc->chip); + print_symbol("%s\n", (unsigned long)desc->chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_INPROGRESS); + P(IRQ_DISABLED); + P(IRQ_PENDING); + P(IRQ_REPLAY); + P(IRQ_AUTODETECT); + P(IRQ_WAITING); + P(IRQ_LEVEL); + P(IRQ_MASKED); +#ifdef CONFIG_IRQ_PER_CPU + P(IRQ_PER_CPU); +#endif + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOAUTOEN); +} + +#undef P + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1279e349953..9eb1d518ee1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1,7 +1,8 @@ /* * linux/kernel/irq/manage.c * - * Copyright (C) 
1992, 1998-2004 Linus Torvalds, Ingo Molnar + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006 Thomas Gleixner * * This file contains driver APIs to the irq subsystem. */ @@ -16,12 +17,6 @@ #ifdef CONFIG_SMP -cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; - -#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) -cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; -#endif - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -42,7 +37,6 @@ void synchronize_irq(unsigned int irq) while (desc->status & IRQ_INPROGRESS) cpu_relax(); } - EXPORT_SYMBOL(synchronize_irq); #endif @@ -60,7 +54,7 @@ EXPORT_SYMBOL(synchronize_irq); */ void disable_irq_nosync(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; unsigned long flags; if (irq >= NR_IRQS) @@ -69,11 +63,10 @@ void disable_irq_nosync(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->handler->disable(irq); + desc->chip->disable(irq); } spin_unlock_irqrestore(&desc->lock, flags); } - EXPORT_SYMBOL(disable_irq_nosync); /** @@ -90,7 +83,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; if (irq >= NR_IRQS) return; @@ -99,7 +92,6 @@ void disable_irq(unsigned int irq) if (desc->action) synchronize_irq(irq); } - EXPORT_SYMBOL(disable_irq); /** @@ -114,7 +106,7 @@ EXPORT_SYMBOL(disable_irq); */ void enable_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; unsigned long flags; if (irq >= NR_IRQS) @@ -123,17 +115,15 @@ void enable_irq(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); switch (desc->depth) { case 0: + printk(KERN_WARNING "Unablanced enable_irq(%d)\n", irq); WARN_ON(1); break; case 1: { 
unsigned int status = desc->status & ~IRQ_DISABLED; - desc->status = status; - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = status | IRQ_REPLAY; - hw_resend_irq(desc->handler,irq); - } - desc->handler->enable(irq); + /* Prevent probing on this irq: */ + desc->status = status | IRQ_NOPROBE; + check_irq_resend(desc, irq); /* fall-through */ } default: @@ -141,9 +131,29 @@ void enable_irq(unsigned int irq) } spin_unlock_irqrestore(&desc->lock, flags); } - EXPORT_SYMBOL(enable_irq); +/** + * set_irq_wake - control irq power management wakeup + * @irq: interrupt to control + * @on: enable/disable power management wakeup + * + * Enable/disable power management wakeup mode + */ +int set_irq_wake(unsigned int irq, unsigned int on) +{ + struct irq_desc *desc = irq_desc + irq; + unsigned long flags; + int ret = -ENXIO; + + spin_lock_irqsave(&desc->lock, flags); + if (desc->chip->set_wake) + ret = desc->chip->set_wake(irq, on); + spin_unlock_irqrestore(&desc->lock, flags); + return ret; +} +EXPORT_SYMBOL(set_irq_wake); + /* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available @@ -153,7 +163,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irqaction *action; - if (irq >= NR_IRQS) + if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) return 0; action = irq_desc[irq].action; @@ -164,11 +174,22 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return !action; } +void compat_irq_chip_set_default_handler(struct irq_desc *desc) +{ + /* + * If the architecture still has not overriden + * the flow handler then zap the default. This + * should catch incorrect flow-type setting. + */ + if (desc->handle_irq == &handle_bad_irq) + desc->handle_irq = NULL; +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
*/ -int setup_irq(unsigned int irq, struct irqaction * new) +int setup_irq(unsigned int irq, struct irqaction *new) { struct irq_desc *desc = irq_desc + irq; struct irqaction *old, **p; @@ -178,7 +199,7 @@ int setup_irq(unsigned int irq, struct irqaction * new) if (irq >= NR_IRQS) return -EINVAL; - if (desc->handler == &no_irq_type) + if (desc->chip == &no_irq_chip) return -ENOSYS; /* * Some drivers like serial.c use request_irq() heavily, @@ -200,14 +221,21 @@ int setup_irq(unsigned int irq, struct irqaction * new) /* * The following block of code has to be executed atomically */ - spin_lock_irqsave(&desc->lock,flags); + spin_lock_irqsave(&desc->lock, flags); p = &desc->action; - if ((old = *p) != NULL) { - /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) + old = *p; + if (old) { + /* + * Can't share interrupts unless both agree to and are + * the same type (level, edge, polarity). So both flag + * fields must have SA_SHIRQ set and the bits which + * set the trigger type must match. 
+ */ + if (!((old->flags & new->flags) & SA_SHIRQ) || + ((old->flags ^ new->flags) & SA_TRIGGER_MASK)) goto mismatch; -#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) +#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) goto mismatch; @@ -222,20 +250,44 @@ int setup_irq(unsigned int irq, struct irqaction * new) } *p = new; -#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) +#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) if (new->flags & SA_PERCPU_IRQ) desc->status |= IRQ_PER_CPU; #endif if (!shared) { - desc->depth = 0; - desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | - IRQ_WAITING | IRQ_INPROGRESS); - if (desc->handler->startup) - desc->handler->startup(irq); - else - desc->handler->enable(irq); + irq_chip_set_defaults(desc->chip); + + /* Setup the type (level, edge polarity) if configured: */ + if (new->flags & SA_TRIGGER_MASK) { + if (desc->chip && desc->chip->set_type) + desc->chip->set_type(irq, + new->flags & SA_TRIGGER_MASK); + else + /* + * SA_TRIGGER_* but the PIC does not support + * multiple flow-types? + */ + printk(KERN_WARNING "setup_irq(%d) SA_TRIGGER" + "set. 
No set_type function available\n", + irq); + } else + compat_irq_chip_set_default_handler(desc); + + desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | + IRQ_INPROGRESS); + + if (!(desc->status & IRQ_NOAUTOEN)) { + desc->depth = 0; + desc->status &= ~IRQ_DISABLED; + if (desc->chip->startup) + desc->chip->startup(irq); + else + desc->chip->enable(irq); + } else + /* Undo nested disables: */ + desc->depth = 1; } - spin_unlock_irqrestore(&desc->lock,flags); + spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; register_irq_proc(irq); @@ -278,10 +330,10 @@ void free_irq(unsigned int irq, void *dev_id) return; desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock,flags); + spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { - struct irqaction * action = *p; + struct irqaction *action = *p; if (action) { struct irqaction **pp = p; @@ -295,18 +347,18 @@ void free_irq(unsigned int irq, void *dev_id) /* Currently used only by UML, might disappear one day.*/ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->handler->release) - desc->handler->release(irq, dev_id); + if (desc->chip->release) + desc->chip->release(irq, dev_id); #endif if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->handler->shutdown) - desc->handler->shutdown(irq); + if (desc->chip->shutdown) + desc->chip->shutdown(irq); else - desc->handler->disable(irq); + desc->chip->disable(irq); } - spin_unlock_irqrestore(&desc->lock,flags); + spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); /* Make sure it's not being used on another CPU */ @@ -314,12 +366,11 @@ void free_irq(unsigned int irq, void *dev_id) kfree(action); return; } - printk(KERN_ERR "Trying to free free IRQ%d\n",irq); - spin_unlock_irqrestore(&desc->lock,flags); + printk(KERN_ERR "Trying to free free IRQ%d\n", irq); + spin_unlock_irqrestore(&desc->lock, flags); return; } } - EXPORT_SYMBOL(free_irq); /** @@ -353,9 +404,9 @@ EXPORT_SYMBOL(free_irq); */ int request_irq(unsigned int irq, 
irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long irqflags, const char * devname, void *dev_id) + unsigned long irqflags, const char *devname, void *dev_id) { - struct irqaction * action; + struct irqaction *action; int retval; /* @@ -368,6 +419,8 @@ int request_irq(unsigned int irq, return -EINVAL; if (irq >= NR_IRQS) return -EINVAL; + if (irq_desc[irq].status & IRQ_NOREQUEST) + return -EINVAL; if (!handler) return -EINVAL; @@ -390,6 +443,5 @@ int request_irq(unsigned int irq, return retval; } - EXPORT_SYMBOL(request_irq); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e3..a57ebe9fa6f 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,19 +3,19 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) { - irq_desc_t *desc = irq_desc + irq; + struct irq_desc *desc = irq_desc + irq; unsigned long flags; spin_lock_irqsave(&desc->lock, flags); desc->move_irq = 1; - pending_irq_cpumask[irq] = mask; + irq_desc[irq].pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } void move_native_irq(int irq) { + struct irq_desc *desc = irq_desc + irq; cpumask_t tmp; - irq_desc_t *desc = irq_descp(irq); if (likely(!desc->move_irq)) return; @@ -30,15 +30,15 @@ void move_native_irq(int irq) desc->move_irq = 0; - if (likely(cpus_empty(pending_irq_cpumask[irq]))) + if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) return; - if (!desc->handler->set_affinity) + if (!desc->chip->set_affinity) return; assert_spin_locked(&desc->lock); - cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); + cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); /* * If there was a valid mask to work with, please @@ -49,14 +49,14 @@ void move_native_irq(int irq) * cause some ioapics to mal-function. * Being paranoid i guess! 
*/ - if (unlikely(!cpus_empty(tmp))) { + if (likely(!cpus_empty(tmp))) { if (likely(!(desc->status & IRQ_DISABLED))) - desc->handler->disable(irq); + desc->chip->disable(irq); - desc->handler->set_affinity(irq,tmp); + desc->chip->set_affinity(irq,tmp); if (likely(!(desc->status & IRQ_DISABLED))) - desc->handler->enable(irq); + desc->chip->enable(irq); } - cpus_clear(pending_irq_cpumask[irq]); + cpus_clear(irq_desc[irq].pending_mask); } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce..607c7809ad0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,18 +12,15 @@ #include "internals.h" -static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; +static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -/* - * The /proc/irq/<irq>/smp_affinity values: - */ -static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; - #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); + /* * Save these away for later use. 
Re-progam when the * interrupt is pending @@ -33,15 +30,16 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) #else void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { - irq_affinity[irq] = mask_val; - irq_desc[irq].handler->set_affinity(irq, mask_val); + set_balance_irq_affinity(irq, mask_val); + irq_desc[irq].affinity = mask_val; + irq_desc[irq].chip->set_affinity(irq, mask_val); } #endif static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); + int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity); if (count - len < 2) return -EINVAL; @@ -56,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned int irq = (int)(long)data, full_count = count, err; cpumask_t new_value, tmp; - if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) + if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) return -EIO; err = cpumask_parse(buffer, count, new_value); @@ -99,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; - if (!irq_dir[irq] || action->dir || !action->name || + if (!irq_desc[irq].dir || action->dir || !action->name || !name_unique(irq, action)) return; @@ -107,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_dir[irq]); + action->dir = proc_mkdir(name, irq_desc[irq].dir); } #undef MAX_NAMELEN @@ -119,22 +117,22 @@ void register_irq_proc(unsigned int irq) char name [MAX_NAMELEN]; if (!root_irq_dir || - (irq_desc[irq].handler == &no_irq_type) || - irq_dir[irq]) + (irq_desc[irq].chip == &no_irq_chip) || + irq_desc[irq].dir) return; memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ - irq_dir[irq] = 
proc_mkdir(name, root_irq_dir); + irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP { struct proc_dir_entry *entry; /* create /proc/irq/<irq>/smp_affinity */ - entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); + entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); if (entry) { entry->nlink = 1; @@ -142,7 +140,6 @@ void register_irq_proc(unsigned int irq) entry->read_proc = irq_affinity_read_proc; entry->write_proc = irq_affinity_write_proc; } - smp_affinity_entry[irq] = entry; } #endif } @@ -152,7 +149,7 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { if (action->dir) - remove_proc_entry(action->dir->name, irq_dir[irq]); + remove_proc_entry(action->dir->name, irq_desc[irq].dir); } void init_irq_proc(void) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c new file mode 100644 index 00000000000..872f91ba2ce --- /dev/null +++ b/kernel/irq/resend.c @@ -0,0 +1,78 @@ +/* + * linux/kernel/irq/resend.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner + * + * This file contains the IRQ-resend code + * + * If the interrupt is waiting to be processed, we try to re-run it. + * We can't directly run it from here since the caller might be in an + * interrupt-protected region. Not all irq controller chips can + * retrigger interrupts at the hardware level, so in those cases + * we allow the resending of IRQs via a tasklet. 
+ */ + +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/interrupt.h> + +#include "internals.h" + +#ifdef CONFIG_HARDIRQS_SW_RESEND + +/* Bitmap to handle software resend of interrupts: */ +static DECLARE_BITMAP(irqs_resend, NR_IRQS); + +/* + * Run software resends of IRQ's + */ +static void resend_irqs(unsigned long arg) +{ + struct irq_desc *desc; + int irq; + + while (!bitmap_empty(irqs_resend, NR_IRQS)) { + irq = find_first_bit(irqs_resend, NR_IRQS); + clear_bit(irq, irqs_resend); + desc = irq_desc + irq; + local_irq_disable(); + desc->handle_irq(irq, desc, NULL); + local_irq_enable(); + } +} + +/* Tasklet to handle resend: */ +static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); + +#endif + +/* + * IRQ resend + * + * Is called with interrupts disabled and desc->lock held. + */ +void check_irq_resend(struct irq_desc *desc, unsigned int irq) +{ + unsigned int status = desc->status; + + /* + * Make sure the interrupt is enabled, before resending it: + */ + desc->chip->enable(irq); + + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status &= ~IRQ_PENDING; + desc->status = status | IRQ_REPLAY; + + if (!desc->chip || !desc->chip->retrigger || + !desc->chip->retrigger(irq)) { +#ifdef CONFIG_HARDIRQS_SW_RESEND + /* Set it pending and activate the softirq: */ + set_bit(irq, irqs_resend); + tasklet_schedule(&resend_tasklet); +#endif + } + } +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec8..b483deed311 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -11,27 +11,25 @@ #include <linux/kallsyms.h> #include <linux/interrupt.h> -static int irqfixup; +static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. 
*/ - static int misrouted_irq(int irq, struct pt_regs *regs) { int i; - irq_desc_t *desc; int ok = 0; int work = 0; /* Did we do work for a real IRQ */ - for(i = 1; i < NR_IRQS; i++) { + for (i = 1; i < NR_IRQS; i++) { + struct irq_desc *desc = irq_desc + i; struct irqaction *action; if (i == irq) /* Already tried */ continue; - desc = &irq_desc[i]; + spin_lock(&desc->lock); - action = desc->action; /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* @@ -45,7 +43,9 @@ static int misrouted_irq(int irq, struct pt_regs *regs) } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; + action = desc->action; spin_unlock(&desc->lock); + while (action) { /* Only shared IRQ handlers are safe to call */ if (action->flags & SA_SHIRQ) { @@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) /* * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk + * IRQ clashing with our walk: */ - while ((desc->status & IRQ_PENDING) && action) { /* * Perform real IRQ processing for the IRQ we deferred @@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if(work) - desc->handler->end(i); + if (work && desc->chip && desc->chip->end) + desc->chip->end(i); spin_unlock(&desc->lock); } /* So the caller can adjust the irq error counts */ @@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs) */ static void -__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +__report_bad_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; @@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) } dump_stack(); printk(KERN_ERR "handlers:\n"); + action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -123,7 +124,8 @@ __report_bad_irq(unsigned 
int irq, irq_desc_t *desc, irqreturn_t action_ret) } } -static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) +static void +report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { static int count = 100; @@ -133,12 +135,12 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio } } -void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, - struct pt_regs *regs) +void note_interrupt(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret, struct pt_regs *regs) { - if (action_ret != IRQ_HANDLED) { + if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; - if (action_ret != IRQ_NONE) + if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } @@ -152,11 +154,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, } desc->irq_count++; - if (desc->irq_count < 100000) + if (likely(desc->irq_count < 100000)) return; desc->irq_count = 0; - if (desc->irqs_unhandled > 99900) { + if (unlikely(desc->irqs_unhandled > 99900)) { /* * The interrupt is stuck */ @@ -166,17 +168,19 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED; - desc->handler->disable(irq); + desc->depth = 1; + desc->chip->disable(irq); } desc->irqs_unhandled = 0; } -int noirqdebug; +int noirqdebug __read_mostly; int __init noirqdebug_setup(char *str) { noirqdebug = 1; printk(KERN_INFO "IRQ lockup detection disabled\n"); + return 1; } @@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str) irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); + return 1; } diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0..50087ecf337 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -902,14 +902,14 @@ static int 
kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; /* * A home grown binary mutex. * Nothing can wait so this mutex is safe to use * in interrupt context :) */ -static int kexec_lock = 0; +static int kexec_lock; asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, @@ -1042,7 +1042,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, void crash_kexec(struct pt_regs *regs) { - struct kimage *image; int locked; @@ -1056,12 +1055,11 @@ void crash_kexec(struct pt_regs *regs) */ locked = xchg(&kexec_lock, 1); if (!locked) { - image = xchg(&kexec_crash_image, NULL); - if (image) { + if (kexec_crash_image) { struct pt_regs fixed_regs; crash_setup_regs(&fixed_regs, regs); machine_crash_shutdown(&fixed_regs); - machine_kexec(image); + machine_kexec(kexec_crash_image); } xchg(&kexec_lock, 0); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29a..64aab081153 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,11 +47,17 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; +static atomic_t kprobe_count; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static struct notifier_block kprobe_page_fault_nb = { + .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to notified first */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) */ static int 
__kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { - struct kprobe *kp; - if (p->break_handler) { - list_for_each_entry_rcu(kp, &old_p->list, list) { - if (kp->break_handler) - return -EEXIST; - } + if (old_p->break_handler) + return -EEXIST; list_add_tail_rcu(&p->list, &old_p->list); + old_p->break_handler = aggr_break_handler; } else list_add_rcu(&p->list, &old_p->list); + if (p->post_handler && !old_p->post_handler) + old_p->post_handler = aggr_post_handler; return 0; } @@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) copy_kprobe(p, ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; - ap->post_handler = aggr_post_handler; ap->fault_handler = aggr_fault_handler; - ap->break_handler = aggr_break_handler; + if (p->post_handler) + ap->post_handler = aggr_post_handler; + if (p->break_handler) + ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); list_add_rcu(&p->list, &ap->list); @@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, old_p = get_kprobe(p->addr); if (old_p) { ret = register_aggr_kprobe(old_p, p); + if (!ret) + atomic_inc(&kprobe_count); goto out; } @@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + if (atomic_add_return(1, &kprobe_count) == \ + (ARCH_INACTIVE_KPROBE_COUNT + 1)) + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); out: @@ -536,14 +549,40 @@ valid_p: kfree(old_p); } arch_remove_kprobe(p); + } else { + mutex_lock(&kprobe_mutex); + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler){ + list_for_each_entry_rcu(list_p, &old_p->list, list){ + if (list_p->post_handler){ + cleanup_p = 2; + break; + } + } + if (cleanup_p == 0) + old_p->post_handler = NULL; + } + mutex_unlock(&kprobe_mutex); } + + /* Call unregister_page_fault_notifier() + * if no probes are active + */ + 
mutex_lock(&kprobe_mutex); + if (atomic_add_return(-1, &kprobe_count) == \ + ARCH_INACTIVE_KPROBE_COUNT) + unregister_page_fault_notifier(&kprobe_page_fault_nb); + mutex_unlock(&kprobe_mutex); + return; } static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, - .priority = 0x7fffffff /* we need to notified first */ + .priority = 0x7fffffff /* we need to be notified first */ }; + int __kprobes register_jprobe(struct jprobe *jp) { /* Todo: Verify probepoint is a function entry point */ @@ -652,6 +691,7 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + atomic_set(&kprobe_count, 0); err = arch_init_kprobes(); if (!err) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67..9e28478a17a 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include <linux/sysfs.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/kexec.h> #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); +#endif /* CONFIG_KEXEC */ + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, +#endif NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6..24be714b04c 100644 --- a/kernel/kthread.c 
+++ b/kernel/kthread.c @@ -45,6 +45,13 @@ struct kthread_stop_info static DEFINE_MUTEX(kthread_stop_lock); static struct kthread_stop_info kthread_stop_info; +/** + * kthread_should_stop - should this kthread return now? + * + * When someone calls kthread_stop on your kthread, it will be woken + * and this will return true. You should then return, and your return + * value will be passed through to kthread_stop(). + */ int kthread_should_stop(void) { return (kthread_stop_info.k == current); @@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) complete(&create->done); } +/** + * kthread_create - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(), kthread_create_on_cpu(). + * + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn can either call do_exit() directly if it is a + * standalone thread for which noone will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM). + */ struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], @@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. 
+ * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create(). + */ void kthread_bind(struct task_struct *k, unsigned int cpu) { BUG_ON(k->state != TASK_INTERRUPTIBLE); @@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) } EXPORT_SYMBOL(kthread_bind); +/** + * kthread_stop - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_stop() for @k to return true, wakes it, and + * waits for it to exit. Your threadfn() must not call do_exit() + * itself if you use this function! This can also be called after + * kthread_create() instead of calling wake_up_process(): the thread + * will exit without calling threadfn(). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop(struct task_struct *k) { return kthread_stop_sem(k, NULL); } EXPORT_SYMBOL(kthread_stop); +/** + * kthread_stop_sem - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * @s: semaphore that @k waits on while idle. + * + * Does essentially the same thing as kthread_stop() above, but wakes + * @k by calling up(@s). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -210,5 +269,5 @@ static __init int helper_init(void) return 0; } -core_initcall(helper_init); +core_initcall(helper_init); diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b..99c022ac3d2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,4 +1,4 @@ -/* Rewritten by Rusty Russell, on the backs of many others... +/* Copyright (C) 2002 Richard Henderson Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 
@@ -40,9 +40,11 @@ #include <linux/string.h> #include <linux/sched.h> #include <linux/mutex.h> +#include <linux/unwind.h> #include <asm/uaccess.h> #include <asm/semaphore.h> #include <asm/cacheflush.h> +#include <linux/license.h> #if 0 #define DEBUGP printk @@ -120,9 +122,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[]; extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const struct kernel_symbol __start___ksymtab_gpl_future[]; extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; +extern const struct kernel_symbol __start___ksymtab_unused[]; +extern const struct kernel_symbol __stop___ksymtab_unused[]; +extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __start___ksymtab_gpl_future[]; +extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; extern const unsigned long __start___kcrctab[]; extern const unsigned long __start___kcrctab_gpl[]; extern const unsigned long __start___kcrctab_gpl_future[]; +extern const unsigned long __start___kcrctab_unused[]; +extern const unsigned long __start___kcrctab_unused_gpl[]; #ifndef CONFIG_MODVERSIONS #define symversion(base, idx) NULL @@ -142,6 +152,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name, return NULL; } +static void printk_unused_warning(const char *name) +{ + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", name); + printk(KERN_WARNING "This symbol will go away in the future.\n"); + printk(KERN_WARNING "Please evalute if this is the right api to use, " + "and if it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); +} + /* Find a symbol, return value, crc and module which owns it */ static unsigned long __find_symbol(const char *name, struct module **owner, @@ -184,6 +205,25 @@ static unsigned long __find_symbol(const 
char *name, return ks->value; } + ks = lookup_symbol(name, __start___ksymtab_unused, + __stop___ksymtab_unused); + if (ks) { + printk_unused_warning(name); + *crc = symversion(__start___kcrctab_unused, + (ks - __start___ksymtab_unused)); + return ks->value; + } + + if (gplok) + ks = lookup_symbol(name, __start___ksymtab_unused_gpl, + __stop___ksymtab_unused_gpl); + if (ks) { + printk_unused_warning(name); + *crc = symversion(__start___kcrctab_unused_gpl, + (ks - __start___ksymtab_unused_gpl)); + return ks->value; + } + /* Now try modules. */ list_for_each_entry(mod, &modules, list) { *owner = mod; @@ -202,6 +242,23 @@ static unsigned long __find_symbol(const char *name, return ks->value; } } + ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); + if (ks) { + printk_unused_warning(name); + *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); + return ks->value; + } + + if (gplok) { + ks = lookup_symbol(name, mod->unused_gpl_syms, + mod->unused_gpl_syms + mod->num_unused_gpl_syms); + if (ks) { + printk_unused_warning(name); + *crc = symversion(mod->unused_gpl_crcs, + (ks - mod->unused_gpl_syms)); + return ks->value; + } + } ks = lookup_symbol(name, mod->gpl_future_syms, (mod->gpl_future_syms + mod->num_gpl_future_syms)); @@ -1051,6 +1108,8 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); + unwind_remove_table(mod->unwind_info, 0); + /* Arch-specific cleanup. 
*/ module_arch_cleanup(mod); @@ -1248,16 +1307,6 @@ static void layout_sections(struct module *mod, } } -static inline int license_is_gpl_compatible(const char *license) -{ - return (strcmp(license, "GPL") == 0 - || strcmp(license, "GPL v2") == 0 - || strcmp(license, "GPL and additional rights") == 0 - || strcmp(license, "Dual BSD/GPL") == 0 - || strcmp(license, "Dual MIT/GPL") == 0 - || strcmp(license, "Dual MPL/GPL") == 0); -} - static void set_license(struct module *mod, const char *license) { if (!license) @@ -1326,7 +1375,7 @@ int is_exported(const char *name, const struct module *mod) if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; else - if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) + if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) return 1; else return 0; @@ -1409,10 +1458,27 @@ static struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; - unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, - exportindex, modindex, obsparmindex, infoindex, gplindex, - crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, - gplfuturecrcindex; + unsigned int i; + unsigned int symindex = 0; + unsigned int strindex = 0; + unsigned int setupindex; + unsigned int exindex; + unsigned int exportindex; + unsigned int modindex; + unsigned int obsparmindex; + unsigned int infoindex; + unsigned int gplindex; + unsigned int crcindex; + unsigned int gplcrcindex; + unsigned int versindex; + unsigned int pcpuindex; + unsigned int gplfutureindex; + unsigned int gplfuturecrcindex; + unsigned int unwindex = 0; + unsigned int unusedindex; + unsigned int unusedcrcindex; + unsigned int unusedgplindex; + unsigned int unusedgplcrcindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1493,15 +1559,22 @@ static struct module *load_module(void __user *umod, exportindex = 
find_sec(hdr, sechdrs, secstrings, "__ksymtab"); gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); + unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused"); + unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl"); crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); + unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); + unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); +#ifdef ARCH_UNWIND_SECTION_NAME + unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); +#endif /* Don't keep modinfo section */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1510,6 +1583,8 @@ static struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif + if (unwindex) + sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. 
*/ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -1639,14 +1714,27 @@ static struct module *load_module(void __user *umod, mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / sizeof(*mod->gpl_future_syms); + mod->num_unused_syms = sechdrs[unusedindex].sh_size / + sizeof(*mod->unused_syms); + mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / + sizeof(*mod->unused_gpl_syms); mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; if (gplfuturecrcindex) mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; + mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; + if (unusedcrcindex) + mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; + mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; + if (unusedgplcrcindex) + mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !crcindex) || (mod->num_gpl_syms && !gplcrcindex) || - (mod->num_gpl_future_syms && !gplfuturecrcindex)) { + (mod->num_gpl_future_syms && !gplfuturecrcindex) || + (mod->num_unused_syms && !unusedcrcindex) || + (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); add_taint(TAINT_FORCED_MODULE); @@ -1738,6 +1826,11 @@ static struct module *load_module(void __user *umod, goto arch_cleanup; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); + /* Size of section 0 is 0, so this works well if no unwind info. */ + mod->unwind_info = unwind_add_table(mod, + (void *)sechdrs[unwindex].sh_addr, + sechdrs[unwindex].sh_size); + /* Get rid of temporary copy */ vfree(hdr); @@ -1836,6 +1929,7 @@ sys_init_module(void __user *umod, mod->state = MODULE_STATE_LIVE; /* Drop initial reference. 
*/ module_put(mod); + unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c37695..e38e4bac97c 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/delay.h> #include <linux/module.h> +#include <linux/poison.h> #include <linux/spinlock.h> #include <linux/kallsyms.h> #include <linux/interrupt.h> @@ -153,13 +154,13 @@ next: continue; count++; cursor = curr->next; - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n#%03d: ", count); printk_lock(lock, filter ? 0 : 1); goto next; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n"); } @@ -316,7 +317,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, lock held at task exit time!\n", task->comm, task->pid); @@ -325,7 +326,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) printk("exiting task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* @@ -352,7 +353,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", current->comm, current->pid, lock, from, to); @@ -362,7 +363,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) printk("freeing task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); 
+ debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* @@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock, void debug_mutex_init_waiter(struct mutex_waiter *waiter) { - memset(waiter, 0x11, sizeof(*waiter)); + memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); waiter->magic = waiter; INIT_LIST_HEAD(&waiter->list); } @@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_free_waiter(struct mutex_waiter *waiter) { DEBUG_WARN_ON(!list_empty(&waiter->list)); - memset(waiter, 0x22, sizeof(*waiter)); + memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb..a5196c36a5f 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name); -#define debug_spin_lock(lock) \ - do { \ - local_irq_disable(); \ - if (debug_mutex_on) \ - spin_lock(lock); \ - } while (0) - -#define debug_spin_unlock(lock) \ - do { \ - if (debug_mutex_on) \ - spin_unlock(lock); \ - local_irq_enable(); \ - preempt_check_resched(); \ - } while (0) - #define debug_spin_lock_save(lock, flags) \ do { \ local_irq_save(flags); \ @@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); spin_lock(lock); \ } while (0) -#define debug_spin_lock_restore(lock, flags) \ +#define debug_spin_unlock_restore(lock, flags) \ do { \ if (debug_mutex_on) \ spin_unlock(lock); \ @@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); preempt_check_resched(); \ } while (0) -#define spin_lock_mutex(lock) \ +#define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ \ 
DEBUG_WARN_ON(in_interrupt()); \ - debug_spin_lock(&debug_mutex_lock); \ + debug_spin_lock_save(&debug_mutex_lock, flags); \ spin_lock(lock); \ DEBUG_WARN_ON(l->magic != l); \ } while (0) -#define spin_unlock_mutex(lock) \ +#define spin_unlock_mutex(lock, flags) \ do { \ spin_unlock(lock); \ - debug_spin_unlock(&debug_mutex_lock); \ + debug_spin_unlock_restore(&debug_mutex_lock, flags); \ } while (0) #define DEBUG_OFF() \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9e..7043db21bbc 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) struct task_struct *task = current; struct mutex_waiter waiter; unsigned int old_val; + unsigned long flags; debug_mutex_init_waiter(&waiter); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); @@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { mutex_remove_waiter(lock, &waiter, task->thread_info); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); return -EINTR; @@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) __set_task_state(task, state); /* didnt get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); schedule(); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); } /* got the lock - rejoice! 
*/ @@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -203,10 +204,11 @@ static fastcall noinline void __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; DEBUG_WARN_ON(lock->owner != current_thread_info()); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); /* * some architectures leave the lock unlocked in the fastpath failure @@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); } /* @@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) static inline int __mutex_trylock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; int prev; - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) @@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); return prev == 1; } diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b67..06918994725 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -9,8 +9,10 @@ * !CONFIG_DEBUG_MUTEXES case. 
Most of them are NOPs: */ -#define spin_lock_mutex(lock) spin_lock(lock) -#define spin_unlock_mutex(lock) spin_unlock(lock) +#define spin_lock_mutex(lock, flags) \ + do { spin_lock(lock); (void)(flags); } while (0) +#define spin_unlock_mutex(lock, flags) \ + do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4..857b4fa0912 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -36,6 +36,24 @@ config PM_DEBUG code. This is helpful when debugging and reporting various PM bugs, like suspend support. +config PM_TRACE + bool "Suspend/resume event tracing" + depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL + default n + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the machine, + then reboot it, then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. 
+ + config SOFTWARE_SUSPEND bool "Software Suspend" depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f..e13e7406784 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -231,7 +231,7 @@ static int software_resume(void) late_initcall(software_resume); -static char * pm_disk_modes[] = { +static const char * const pm_disk_modes[] = { [PM_DISK_FIRMWARE] = "firmware", [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", diff --git a/kernel/power/main.c b/kernel/power/main.c index 0a907f0dc56..6d295c77679 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,7 +15,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/pm.h> - +#include <linux/console.h> #include "power.h" @@ -145,7 +145,7 @@ static void suspend_finish(suspend_state_t state) -static char *pm_states[PM_SUSPEND_MAX] = { +static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", #ifdef CONFIG_SOFTWARE_SUSPEND @@ -262,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; - char ** s; + const char * const *s; char *p; int error; int len; diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f2176..57a792982fb 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -55,7 +55,7 @@ struct snapshot_handle { unsigned int page; unsigned int page_offset; unsigned int prev; - struct pbe *pbe; + struct pbe *pbe, *last_pbe; void *buffer; unsigned int buf_offset; }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b7..24c96f35423 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -150,6 +150,10 @@ int restore_highmem(void) } return 0; } +#else +static inline unsigned int 
count_highmem_pages(void) {return 0;} +static inline int save_highmem(void) {return 0;} +static inline int restore_highmem(void) {return 0;} #endif static int pfn_is_nosave(unsigned long pfn) @@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } -/** - * On resume it is necessary to trace and eventually free the unsafe - * pages that have been allocated, because they are needed for I/O - * (on x86-64 we likely will "eat" these pages once again while - * creating the temporary page translation tables) - */ - -struct eaten_page { - struct eaten_page *next; - char padding[PAGE_SIZE - sizeof(void *)]; -}; - -static struct eaten_page *eaten_pages = NULL; - -static void release_eaten_pages(void) -{ - struct eaten_page *p, *q; - - p = eaten_pages; - while (p) { - q = p->next; - /* We don't want swsusp_free() to free this page again */ - ClearPageNosave(virt_to_page(p)); - free_page((unsigned long)p); - p = q; - } - eaten_pages = NULL; -} +static unsigned int unsafe_pages; /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * which had been used before suspend. + * used before suspend. 
* * The unsafe pages are marked with the PG_nosave_free flag - * - * Allocated but unusable (ie eaten) memory pages should be marked - * so that swsusp_free() can release them + * and we count them using unsafe_pages */ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { void *res; + res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - do { + while (res && PageNosaveFree(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) { - /* This is for swsusp_free() */ - SetPageNosave(virt_to_page(res)); - ((struct eaten_page *)res)->next = eaten_pages; - eaten_pages = res; - } - } while (res && PageNosaveFree(virt_to_page(res))); - else - res = (void *)get_zeroed_page(gfp_mask); + } if (res) { SetPageNosave(virt_to_page(res)); SetPageNosaveFree(virt_to_page(res)); @@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) * On each page we set up a list of struct_pbe elements. 
*/ -struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, + int safe_needed) { unsigned int num; struct pbe *pblist, *pbe; @@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist) return -EFAULT; } + unsafe_pages = 0; + return 0; } @@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, } /** - * create_image - use metadata contained in the PBE list + * prepare_image - use metadata contained in the PBE list * pointed to by pagedir_nosave to mark the pages that will * be overwritten in the process of restoring the system - * memory state from the image and allocate memory for - * the image avoiding these pages + * memory state from the image ("unsafe" pages) and allocate + * memory for the image + * + * The idea is to allocate the PBE list first and then + * allocate as many pages as it's needed for the image data, + * but not to assign these pages to the PBEs initially. 
+ * Instead, we just mark them as allocated and create a list + * of "safe" which will be used later */ -static int create_image(struct snapshot_handle *handle) +struct safe_page { + struct safe_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct safe_page *safe_pages; + +static int prepare_image(struct snapshot_handle *handle) { int error = 0; - struct pbe *p, *pblist; + unsigned int nr_pages = nr_copy_pages; + struct pbe *p, *pblist = NULL; p = pagedir_nosave; error = mark_unsafe_pages(p); if (!error) { - pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); if (pblist) copy_page_backup_list(pblist, p); free_pagedir(p, 0); if (!pblist) error = -ENOMEM; } - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + safe_pages = NULL; + if (!error && nr_pages > unsafe_pages) { + nr_pages -= unsafe_pages; + while (nr_pages--) { + struct safe_page *ptr; + + ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); + if (!ptr) { + error = -ENOMEM; + break; + } + if (!PageNosaveFree(virt_to_page(ptr))) { + /* The page is "safe", add it to the list */ + ptr->next = safe_pages; + safe_pages = ptr; + } + /* Mark the page as allocated */ + SetPageNosave(virt_to_page(ptr)); + SetPageNosaveFree(virt_to_page(ptr)); + } + } if (!error) { - release_eaten_pages(); pagedir_nosave = pblist; } else { - pagedir_nosave = NULL; handle->pbe = NULL; - nr_copy_pages = 0; - nr_meta_pages = 0; + swsusp_free(); } return error; } +static void *get_buffer(struct snapshot_handle *handle) +{ + struct pbe *pbe = handle->pbe, *last = handle->last_pbe; + struct page *page = virt_to_page(pbe->orig_address); + + if (PageNosave(page) && PageNosaveFree(page)) { + /* + * We have allocated the "original" page frame and we can + * use it directly to store the read page + */ + pbe->address = 0; + if (last && last->next) + last->next = NULL; + return (void *)pbe->orig_address; + } + /* + * The "original" page frame has not been 
allocated and we have to + * use a "safe" page frame to store the read page + */ + pbe->address = (unsigned long)safe_pages; + safe_pages = safe_pages->next; + if (last) + last->next = pbe; + handle->last_pbe = pbe; + return (void *)pbe->address; +} + /** * snapshot_write_next - used for writing the system memory snapshot. * @@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) } else if (handle->prev <= nr_meta_pages) { handle->pbe = unpack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) { - error = create_image(handle); + error = prepare_image(handle); if (error) return error; handle->pbe = pagedir_nosave; - handle->buffer = (void *)handle->pbe->address; + handle->last_pbe = NULL; + handle->buffer = get_buffer(handle); } } else { handle->pbe = handle->pbe->next; - handle->buffer = (void *)handle->pbe->address; + handle->buffer = get_buffer(handle); } handle->prev = handle->page; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e..17f669c8301 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void); int save_highmem(void); int restore_highmem(void); #else -static int save_highmem(void) { return 0; } -static int restore_highmem(void) { return 0; } -static unsigned int count_highmem_pages(void) { return 0; } +static inline int save_highmem(void) { return 0; } +static inline int restore_highmem(void) { return 0; } +static inline unsigned int count_highmem_pages(void) { return 0; } #endif /** @@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) */ #define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} int swsusp_shrink_memory(void) { @@ -192,15 +198,17 @@ int swsusp_shrink_memory(void) PAGES_FOR_IO; tmp = size; for_each_zone (zone) - if (!is_highmem(zone)) + if (!is_highmem(zone) && 
populated_zone(zone)) { tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } if (tmp > 0) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(tmp); if (!tmp) return -ENOMEM; pages += tmp; } else if (size > image_size / PAGE_SIZE) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); pages += tmp; } printk("\b%c", p[i++%4]); diff --git a/kernel/printk.c b/kernel/printk.c index 19a95561929..95b7fe17f12 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -24,6 +24,7 @@ #include <linux/console.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/config.h> #include <linux/delay.h> @@ -327,7 +328,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) struct console *con; for (con = console_drivers; con; con = con->next) { - if ((con->flags & CON_ENABLED) && con->write) + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - start); } } @@ -437,6 +440,7 @@ static int printk_time = 1; #else static int printk_time = 0; #endif +module_param(printk_time, int, S_IRUGO | S_IWUSR); static int __init printk_time_setup(char *str) { @@ -453,6 +457,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) return sched_clock(); } +/* Check if we have any console registered that can be called early in boot. 
*/ +static int have_callable_console(void) +{ + struct console *con; + + for (con = console_drivers; con; con = con->next) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + /** * printk - print a kernel message * @fmt: format string @@ -566,27 +582,29 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id())) { + if (!down_trylock(&console_sem)) { /* - * Some console drivers may assume that per-cpu resources have - * been allocated. So don't allow them to be called by this - * CPU until it is officially up. We shouldn't be calling into - * random console drivers on a CPU which doesn't exist yet.. + * We own the drivers. We can drop the spinlock and + * let release_console_sem() print the text, maybe ... */ + console_locked = 1; printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); - goto out; - } - if (!down_trylock(&console_sem)) { - console_locked = 1; + /* - * We own the drivers. We can drop the spinlock and let - * release_console_sem() print the text + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ - printk_cpu = UINT_MAX; - spin_unlock_irqrestore(&logbuf_lock, flags); - console_may_schedule = 0; - release_console_sem(); + if (cpu_online(smp_processor_id()) || have_callable_console()) { + console_may_schedule = 0; + release_console_sem(); + } else { + /* Release by hand to avoid flushing the buffer. */ + console_locked = 0; + up(&console_sem); + } } else { /* * Someone else owns the drivers. 
We drop the spinlock, which @@ -596,7 +614,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } -out: + preempt_enable(); return printed_len; } diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e50..5a730fdb1a2 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -299,7 +299,7 @@ out: } #ifdef CONFIG_HOTPLUG_CPU -static int profile_cpu_callback(struct notifier_block *info, +static int __devinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e..335c5b932e1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) static int may_attach(struct task_struct *task) { - if (!task->mm) - return -EPERM; + /* May we inspect the given task? + * This check is used both for attaching with ptrace + * and for allowing access to sensitive information in /proc. + * + * ptrace_attach denies several cases that /proc allows + * because setting up the necessary parent/child relationship + * or halting the specified task is impossible. 
+ */ + int dumpable = 0; + /* Don't let security modules deny introspection */ + if (task == current) + return 0; if (((current->uid != task->euid) || (current->uid != task->suid) || (current->uid != task->uid) || @@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + if (task->mm) + dumpable = task->mm->dumpable; + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; return security_ptrace(current, task); @@ -176,6 +188,8 @@ repeat: goto repeat; } + if (!task->mm) + goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; @@ -200,7 +214,7 @@ out: return retval; } -void __ptrace_detach(struct task_struct *child, unsigned int data) +static inline void __ptrace_detach(struct task_struct *child, unsigned int data) { child->exit_code = data; /* .. re-parent .. */ @@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); write_lock_irq(&tasklist_lock); + /* protect against de_thread()->release_task() */ if (child->ptrace) __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bb..f464f5ae3f1 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -182,6 +182,15 @@ long rcu_batches_completed(void) return rcu_ctrlblk.completed; } +/* + * Return the number of RCU batches processed thus far. Useful + * for debug and statistics. 
+ */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_ctrlblk.completed; +} + static void rcu_barrier_callback(struct rcu_head *notused) { if (atomic_dec_and_test(&rcu_barrier_cpu_count)) @@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int rcu_cpu_notify(struct notifier_block *self, +static int __devinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block rcu_nb = { +static struct notifier_block __devinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; @@ -612,14 +621,6 @@ void synchronize_rcu(void) wait_for_completion(&rcu.completion); } -/* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. - */ -void synchronize_kernel(void) -{ - synchronize_rcu(); -} - module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); @@ -627,7 +628,7 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ -EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. 
*/ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d1..4d1c3d24712 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update /proc-based torture test facility + * Read-Copy Update module-based torture test facility * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ static int verbose; /* Print more debug info. */ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ +static char *torture_type = "rcu"; /* What to torture. */ module_param(nreaders, int, 0); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); module_param(shuffle_interval, int, 0); MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -#define TORTURE_FLAG "rcutorture: " +module_param(torture_type, charp, 0); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); + +#define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ - do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) + do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) + do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) #define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) + do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! 
" s "\n", torture_type); } while (0) static char printk_buf[4096]; @@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. */ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else - call_rcu(p, rcu_torture_cb); -} - struct rcu_random_state { unsigned long rrs_state; unsigned long rrs_count; @@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp) } /* + * Operations vector for selecting different types of tests. + */ + +struct rcu_torture_ops { + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferredfree)(struct rcu_torture *p); + int (*stats)(char *page); + char *name; +}; +static struct rcu_torture_ops *cur_ops = NULL; + +/* + * Definitions for rcu torture testing. + */ + +static int rcu_torture_read_lock(void) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_torture_read_unlock(int idx) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. 
*/ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else + cur_ops->deferredfree(rp); +} + +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferredfree = rcu_torture_deferred_free, + .stats = NULL, + .name = "rcu" +}; + +/* + * Definitions for rcu_bh torture testing. + */ + +static int rcu_bh_torture_read_lock(void) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferredfree = rcu_bh_torture_deferred_free, + .stats = NULL, + .name = "rcu_bh" +}; + +static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, &rcu_bh_ops, NULL }; + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline"). 
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg) do { schedule_timeout_uninterruptible(1); - if (rcu_batches_completed() == oldbatch) - continue; if ((rp = rcu_torture_alloc()) == NULL) continue; rp->rtort_pipe_count = 0; @@ -225,10 +318,10 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); + cur_ops->deferredfree(old_rp); } rcu_torture_current_version++; - oldbatch = rcu_batches_completed(); + oldbatch = cur_ops->completed(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); while (!kthread_should_stop()) @@ -246,6 +339,7 @@ static int rcu_torture_reader(void *arg) { int completed; + int idx; DEFINE_RCU_RANDOM(rand); struct rcu_torture *p; int pipe_count; @@ -254,12 +348,12 @@ rcu_torture_reader(void *arg) set_user_nice(current, 19); do { - rcu_read_lock(); - completed = rcu_batches_completed(); + idx = cur_ops->readlock(); + completed = cur_ops->completed(); p = rcu_dereference(rcu_torture_current); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcu_read_unlock(); + cur_ops->readunlock(idx); schedule_timeout_interruptible(HZ); continue; } @@ -273,14 +367,14 @@ rcu_torture_reader(void *arg) pipe_count = RCU_TORTURE_PIPE_LEN; } ++__get_cpu_var(rcu_torture_count)[pipe_count]; - completed = rcu_batches_completed() - completed; + completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... 
*/ completed = RCU_TORTURE_PIPE_LEN; } ++__get_cpu_var(rcu_torture_batch)[completed]; preempt_enable(); - rcu_read_unlock(); + cur_ops->readunlock(idx); schedule(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); @@ -311,7 +405,7 @@ rcu_torture_printk(char *page) if (pipesummary[i] != 0) break; } - cnt += sprintf(&page[cnt], "rcutorture: "); + cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " "rtmbe: %d", @@ -324,7 +418,7 @@ rcu_torture_printk(char *page) atomic_read(&n_rcu_torture_mberror)); if (atomic_read(&n_rcu_torture_mberror) != 0) cnt += sprintf(&page[cnt], " !!!"); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); if (i > 1) { cnt += sprintf(&page[cnt], "!!! "); atomic_inc(&n_rcu_torture_error); @@ -332,17 +426,19 @@ rcu_torture_printk(char *page) cnt += sprintf(&page[cnt], "Reader Pipe: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\nrcutorture: "); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); cnt += sprintf(&page[cnt], "Free-Block Circulation: "); for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { cnt += sprintf(&page[cnt], " %d", atomic_read(&rcu_torture_wcount[i])); } cnt += sprintf(&page[cnt], "\n"); + if (cur_ops->stats != NULL) + cnt += cur_ops->stats(&page[cnt]); return cnt; } @@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg) static inline void rcu_torture_print_module_parms(char *tag) { - printk(KERN_ALERT 
TORTURE_FLAG "--- %s: nreaders=%d " + printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " "shuffle_interval = %d\n", - tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, - shuffle_interval); + torture_type, tag, nrealreaders, stat_interval, verbose, + test_no_idle_hz, shuffle_interval); } static void @@ -493,6 +589,9 @@ rcu_torture_cleanup(void) rcu_barrier(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms("End of test: FAILURE"); else @@ -508,6 +607,20 @@ rcu_torture_init(void) /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) { + break; + } + } + if (cur_ops == NULL) { + printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", + torture_type); + return (-EINVAL); + } + if (cur_ops->init != NULL) + cur_ops->init(); /* no "goto unwind" prior to this point!!! 
*/ + if (nreaders >= 0) nrealreaders = nreaders; else diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a..bf1130d81b7 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -23,20 +23,18 @@ struct resource ioport_resource = { .name = "PCI IO", - .start = 0x0000, + .start = 0, .end = IO_SPACE_LIMIT, .flags = IORESOURCE_IO, }; - EXPORT_SYMBOL(ioport_resource); struct resource iomem_resource = { .name = "PCI mem", - .start = 0UL, - .end = ~0UL, + .start = 0, + .end = -1, .flags = IORESOURCE_MEM, }; - EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); @@ -83,10 +81,10 @@ static int r_show(struct seq_file *m, void *v) for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; - seq_printf(m, "%*s%0*lx-%0*lx : %s\n", + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", - width, r->start, - width, r->end, + width, (unsigned long long) r->start, + width, (unsigned long long) r->end, r->name ? r->name : "<BAD>"); return 0; } @@ -151,8 +149,8 @@ __initcall(ioresources_init); /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { - unsigned long start = new->start; - unsigned long end = new->end; + resource_size_t start = new->start; + resource_size_t end = new->end; struct resource *tmp, **p; if (end < start) @@ -232,15 +230,52 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Finds the lowest memory resource that exists within [res->start.res->end) + * the caller must specify res->start, res->end, res->flags. + * If found, returns 0, res is overwritten, if not found, returns -1.
+ */ +int find_next_system_ram(struct resource *res) +{ + resource_size_t start, end; + struct resource *p; + + BUG_ON(!res); + + start = res->start; + end = res->end; + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + /* system ram is just marked as IORESOURCE_MEM */ + if (p->flags != res->flags) + continue; + if (p->start > end) { + p = NULL; + break; + } + if (p->start >= start) + break; + } + read_unlock(&resource_lock); + if (!p) + return -1; + /* copy data */ + res->start = p->start; + res->end = p->end; + return 0; +} +#endif + /* * Find empty slot in the resource tree given range and alignment. */ static int find_resource(struct resource *root, struct resource *new, - unsigned long size, - unsigned long min, unsigned long max, - unsigned long align, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, void (*alignf)(void *, struct resource *, - unsigned long, unsigned long), + resource_size_t, resource_size_t), void *alignf_data) { struct resource *this = root->child; @@ -282,11 +317,10 @@ static int find_resource(struct resource *root, struct resource *new, * Allocate empty slot in the resource tree given range and alignment. */ int allocate_resource(struct resource *root, struct resource *new, - unsigned long size, - unsigned long min, unsigned long max, - unsigned long align, + resource_size_t size, resource_size_t min, + resource_size_t max, resource_size_t align, void (*alignf)(void *, struct resource *, - unsigned long, unsigned long), + resource_size_t, resource_size_t), void *alignf_data) { int err; @@ -378,10 +412,10 @@ EXPORT_SYMBOL(insert_resource); * arguments. Returns -EBUSY if it can't fit. Existing children of * the resource are assumed to be immutable. 
*/ -int adjust_resource(struct resource *res, unsigned long start, unsigned long size) +int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { struct resource *tmp, *parent = res->parent; - unsigned long end = start + size - 1; + resource_size_t end = start + size - 1; int result = -EBUSY; write_lock(&resource_lock); @@ -428,7 +462,9 @@ EXPORT_SYMBOL(adjust_resource); * * Release-region releases a matching busy region. */ -struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) +struct resource * __request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name) { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); @@ -464,7 +500,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start, EXPORT_SYMBOL(__request_region); -int __check_region(struct resource *parent, unsigned long start, unsigned long n) +int __check_region(struct resource *parent, resource_size_t start, + resource_size_t n) { struct resource * res; @@ -479,10 +516,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n EXPORT_SYMBOL(__check_region); -void __release_region(struct resource *parent, unsigned long start, unsigned long n) +void __release_region(struct resource *parent, resource_size_t start, + resource_size_t n) { struct resource **p; - unsigned long end; + resource_size_t end; p = &parent->child; end = start + n - 1; @@ -511,7 +549,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon write_unlock(&resource_lock); - printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); + printk(KERN_WARNING "Trying to free nonexistent resource " + "<%016llx-%016llx>\n", (unsigned long long)start, + (unsigned long long)end); } EXPORT_SYMBOL(__release_region); diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 
00000000000..4aa8a2c9f45 --- /dev/null +++ b/kernel/rtmutex-debug.c @@ -0,0 +1,513 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This code is based on the rt.c implementation in the preempt-rt tree. + * Portions of said code are + * + * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Copyright (C) 2006 Esben Nielsen + * Copyright (C) 2006 Kihon Technologies Inc., + * Steven Rostedt <rostedt@goodmis.org> + * + * See rt.c in preempt-rt for proper credits and further information + */ +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/plist.h> +#include <linux/fs.h> + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +# define TRACE_WARN_ON(x) WARN_ON(x) +# define TRACE_BUG_ON(x) BUG_ON(x) + +# define TRACE_OFF() \ +do { \ + if (rt_trace_on) { \ + rt_trace_on = 0; \ + console_verbose(); \ + if (spin_is_locked(&current->pi_lock)) \ + spin_unlock(&current->pi_lock); \ + if (spin_is_locked(&current->held_list_lock)) \ + spin_unlock(&current->held_list_lock); \ + } \ +} while (0) + +# define TRACE_OFF_NOLOCK() \ +do { \ + if (rt_trace_on) { \ + rt_trace_on = 0; \ + console_verbose(); \ + } \ +} while (0) + +# define TRACE_BUG_LOCKED() \ +do { \ + TRACE_OFF(); \ + BUG(); \ +} while (0) + +# define TRACE_WARN_ON_LOCKED(c) \ +do { \ + if (unlikely(c)) { \ + TRACE_OFF(); \ + WARN_ON(1); \ + } \ +} while (0) + +# define TRACE_BUG_ON_LOCKED(c) \ +do { \ + if (unlikely(c)) \ + TRACE_BUG_LOCKED(); \ +} while (0) + +#ifdef CONFIG_SMP +# define SMP_TRACE_BUG_ON_LOCKED(c)
TRACE_BUG_ON_LOCKED(c) +#else +# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) +#endif + +/* + * deadlock detection flag. We turn it off when we detect + * the first problem because we dont want to recurse back + * into the tracing code when doing error printk or + * executing a BUG(): + */ +int rt_trace_on = 1; + +void deadlock_trace_off(void) +{ + rt_trace_on = 0; +} + +static void printk_task(task_t *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk("<none>"); +} + +static void printk_task_short(task_t *p) +{ + if (p) + printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); + else + printk("<none>"); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ + if (lock->name) + printk(" [%p] {%s}\n", + lock, lock->name); + else + printk(" [%p] {%s:%d}\n", + lock, lock->file, lock->line); + + if (print_owner && rt_mutex_owner(lock)) { + printk(".. ->owner: %p\n", lock->owner); + printk(".. held by: "); + printk_task(rt_mutex_owner(lock)); + printk("\n"); + } + if (rt_mutex_owner(lock)) { + printk("... 
acquired at: "); + print_symbol("%s\n", lock->acquire_ip); + } +} + +static void printk_waiter(struct rt_mutex_waiter *w) +{ + printk("-------------------------\n"); + printk("| waiter struct %p:\n", w); + printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", + w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, + w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, + w->list_entry.prio); + printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", + w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, + w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, + w->pi_list_entry.prio); + printk("\n| lock:\n"); + printk_lock(w->lock, 1); + printk("| w->ti->task:\n"); + printk_task(w->task); + printk("| blocked at: "); + print_symbol("%s\n", w->ip); + printk("-------------------------\n"); +} + +static void show_task_locks(task_t *p) +{ + switch (p->state) { + case TASK_RUNNING: printk("R"); break; + case TASK_INTERRUPTIBLE: printk("S"); break; + case TASK_UNINTERRUPTIBLE: printk("D"); break; + case TASK_STOPPED: printk("T"); break; + case EXIT_ZOMBIE: printk("Z"); break; + case EXIT_DEAD: printk("X"); break; + default: printk("?"); break; + } + printk_task(p); + if (p->pi_blocked_on) { + struct rt_mutex *lock = p->pi_blocked_on->lock; + + printk(" blocked on:"); + printk_lock(lock, 1); + } else + printk(" (not blocked)\n"); +} + +void rt_mutex_show_held_locks(task_t *task, int verbose) +{ + struct list_head *curr, *cursor = NULL; + struct rt_mutex *lock; + task_t *t; + unsigned long flags; + int count = 0; + + if (!rt_trace_on) + return; + + if (verbose) { + printk("------------------------------\n"); + printk("| showing all locks held by: | ("); + printk_task_short(task); + printk("):\n"); + printk("------------------------------\n"); + } + +next: + spin_lock_irqsave(&task->held_list_lock, flags); + list_for_each(curr, &task->held_list_head) { + if (cursor && curr != cursor) + 
continue; + lock = list_entry(curr, struct rt_mutex, held_list_entry); + t = rt_mutex_owner(lock); + WARN_ON(t != task); + count++; + cursor = curr->next; + spin_unlock_irqrestore(&task->held_list_lock, flags); + + printk("\n#%03d: ", count); + printk_lock(lock, 0); + goto next; + } + spin_unlock_irqrestore(&task->held_list_lock, flags); + + printk("\n"); +} + +void rt_mutex_show_all_locks(void) +{ + task_t *g, *p; + int count = 10; + int unlock = 1; + + printk("\n"); + printk("----------------------\n"); + printk("| showing all tasks: |\n"); + printk("----------------------\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... "); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } + if (count != 10) + printk(" locked it.\n"); + + do_each_thread(g, p) { + show_task_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + printk("\n"); + + printk("-----------------------------------------\n"); + printk("| showing all locks held in the system: |\n"); + printk("-----------------------------------------\n"); + + do_each_thread(g, p) { + rt_mutex_show_held_locks(p, 0); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} + +void rt_mutex_debug_check_no_locks_held(task_t *task) +{ + struct rt_mutex_waiter *w; + struct list_head *curr; + struct rt_mutex *lock; + + if (!rt_trace_on) + return; + if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { + printk("BUG: PI priority 
boost leaked!\n"); + printk_task(task); + printk("\n"); + } + if (list_empty(&task->held_list_head)) + return; + + spin_lock(&task->pi_lock); + plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { + TRACE_OFF(); + + printk("hm, PI interest held at exit time? Task:\n"); + printk_task(task); + printk_waiter(w); + return; + } + spin_unlock(&task->pi_lock); + + list_for_each(curr, &task->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); + + printk("BUG: %s/%d, lock held at task exit time!\n", + task->comm, task->pid); + printk_lock(lock, 1); + if (rt_mutex_owner(lock) != task) + printk("exiting task is not even the owner??\n"); + } +} + +int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) +{ + const void *to = from + len; + struct list_head *curr; + struct rt_mutex *lock; + unsigned long flags; + void *lock_addr; + + if (!rt_trace_on) + return 0; + + spin_lock_irqsave(&current->held_list_lock, flags); + list_for_each(curr, &current->held_list_head) { + lock = list_entry(curr, struct rt_mutex, held_list_entry); + lock_addr = lock; + if (lock_addr < from || lock_addr >= to) + continue; + TRACE_OFF(); + + printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", + current->comm, current->pid, lock, from, to); + dump_stack(); + printk_lock(lock, 1); + if (rt_mutex_owner(lock) != current) + printk("freeing task is not even the owner??\n"); + return 1; + } + spin_unlock_irqrestore(&current->held_list_lock, flags); + + return 0; +} + +void rt_mutex_debug_task_free(struct task_struct *task) +{ + WARN_ON(!plist_head_empty(&task->pi_waiters)); + WARN_ON(task->pi_blocked_on); +} + +/* + * We fill out the fields in the waiter to store the information about + * the deadlock. We print when we return. act_waiter can be NULL in + * case of a remove waiter operation.
+ */ +void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, + struct rt_mutex *lock) +{ + struct task_struct *task; + + if (!rt_trace_on || detect || !act_waiter) + return; + + task = rt_mutex_owner(act_waiter->lock); + if (task && task != current) { + act_waiter->deadlock_task_pid = task->pid; + act_waiter->deadlock_lock = lock; + } +} + +void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +{ + struct task_struct *task; + + if (!waiter->deadlock_lock || !rt_trace_on) + return; + + task = find_task_by_pid(waiter->deadlock_task_pid); + if (!task) + return; + + TRACE_OFF_NOLOCK(); + + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk( "--------------------------------------------\n"); + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task->pid, current->comm, current->pid); + + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, current->pid); + printk_lock(waiter->lock, 1); + + printk("... trying at: "); + print_symbol("%s\n", waiter->ip); + + printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); + printk_lock(waiter->deadlock_lock, 1); + + rt_mutex_show_held_locks(current, 1); + rt_mutex_show_held_locks(task, 1); + + printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, current->pid); + dump_stack(); + rt_mutex_show_all_locks(); + printk("[ turning off deadlock detection." + "Please report this trace. 
]\n\n"); + local_irq_disable(); +} + +void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&current->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, &current->held_list_head); + spin_unlock_irqrestore(&current->held_list_lock, flags); + + lock->acquire_ip = ip; + } +} + +void debug_rt_mutex_unlock(struct rt_mutex *lock) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&current->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(&current->held_list_lock, flags); + } +} + +void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__) +{ + unsigned long flags; + + if (rt_trace_on) { + TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&powner->held_list_lock, flags); + list_add_tail(&lock->held_list_entry, &powner->held_list_head); + spin_unlock_irqrestore(&powner->held_list_lock, flags); + + lock->acquire_ip = ip; + } +} + +void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ + unsigned long flags; + + if (rt_trace_on) { + struct task_struct *owner = rt_mutex_owner(lock); + + TRACE_WARN_ON_LOCKED(!owner); + TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); + + spin_lock_irqsave(&owner->held_list_lock, flags); + list_del_init(&lock->held_list_entry); + spin_unlock_irqrestore(&owner->held_list_lock, flags); + } +} + +void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + memset(waiter, 0x11, sizeof(*waiter)); + plist_node_init(&waiter->list_entry, MAX_PRIO); + plist_node_init(&waiter->pi_list_entry, MAX_PRIO); +} + +void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); +
TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + TRACE_WARN_ON(waiter->task); + memset(waiter, 0x22, sizeof(*waiter)); +} + +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + void *addr = lock; + + if (rt_trace_on) { + rt_mutex_debug_check_no_locks_freed(addr, + sizeof(struct rt_mutex)); + INIT_LIST_HEAD(&lock->held_list_entry); + lock->name = name; + } +} + +void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) +{ +} + +void rt_mutex_deadlock_account_unlock(struct task_struct *task) +{ +} + diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 00000000000..7612fbc62d7 --- /dev/null +++ b/kernel/rtmutex-debug.h @@ -0,0 +1,37 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains macros used solely by rtmutex.c. Debug version. 
+ */ + +#define __IP_DECL__ , unsigned long ip +#define __IP__ , ip +#define __RET_IP__ , (unsigned long)__builtin_return_address(0) + +extern void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); +extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); +extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); +extern void debug_rt_mutex_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner __IP_DECL__); +extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, + struct rt_mutex *lock); +extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +# define debug_rt_mutex_reset_waiter(w) \ + do { (w)->deadlock_lock = NULL; } while (0) + +static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + int detect) +{ + return (waiter != NULL); +} diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 00000000000..e82c2f84824 --- /dev/null +++ b/kernel/rtmutex-tester.c @@ -0,0 +1,440 @@ +/* + * RT-Mutex-tester: scriptable tester for rt mutexes + * + * started by Thomas Gleixner: + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + */ +#include <linux/config.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/spinlock.h> +#include <linux/sysdev.h> +#include <linux/timer.h> + +#include "rtmutex.h" + +#define MAX_RT_TEST_THREADS 8 +#define MAX_RT_TEST_MUTEXES 8 + +static spinlock_t rttest_lock; +static atomic_t rttest_event; + +struct test_thread_data { + int opcode; + 
int opdata; + int mutexes[MAX_RT_TEST_MUTEXES]; + int bkl; + int event; + struct sys_device sysdev; +}; + +static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; +static task_t *threads[MAX_RT_TEST_THREADS]; +static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; + +enum test_opcodes { + RTTEST_NOP = 0, + RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ + RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ + RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ + RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ + RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ + RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ + RTTEST_LOCKBKL, /* 9 Lock BKL */ + RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ + RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ + RTTEST_RESET = 99, /* 99 Reset all pending operations */ +}; + +static int handle_op(struct test_thread_data *td, int lockwakeup) +{ + int i, id, ret = -EINVAL; + + switch(td->opcode) { + + case RTTEST_NOP: + return 0; + + case RTTEST_LOCKCONT: + td->mutexes[td->opdata] = 1; + td->event = atomic_add_return(1, &rttest_event); + return 0; + + case RTTEST_RESET: + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { + if (td->mutexes[i] == 4) { + rt_mutex_unlock(&mutexes[i]); + td->mutexes[i] = 0; + } + } + + if (!lockwakeup && td->bkl == 4) { + unlock_kernel(); + td->bkl = 0; + } + return 0; + + case RTTEST_RESETEVENT: + atomic_set(&rttest_event, 0); + return 0; + + default: + if (lockwakeup) + return ret; + } + + switch(td->opcode) { + + case RTTEST_LOCK: + case RTTEST_LOCKNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_lock(&mutexes[id]); + 
td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 4; + return 0; + + case RTTEST_LOCKINT: + case RTTEST_LOCKINTNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + ret = rt_mutex_lock_interruptible(&mutexes[id], 0); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = ret ? 0 : 4; + return ret ? -EINTR : 0; + + case RTTEST_UNLOCK: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) + return ret; + + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_unlock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 0; + return 0; + + case RTTEST_LOCKBKL: + if (td->bkl) + return 0; + td->bkl = 1; + lock_kernel(); + td->bkl = 4; + return 0; + + case RTTEST_UNLOCKBKL: + if (td->bkl != 4) + break; + unlock_kernel(); + td->bkl = 0; + return 0; + + default: + break; + } + return ret; +} + +/* + * Schedule replacement for rtsem_down(). Only called for threads with + * PF_MUTEX_TESTER set. + * + * This allows us to have finegrained control over the event flow. 
+ * + */ +void schedule_rt_mutex_test(struct rt_mutex *mutex) +{ + int tid, op, dat; + struct test_thread_data *td; + + /* We have to lookup the task */ + for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { + if (threads[tid] == current) + break; + } + + BUG_ON(tid == MAX_RT_TEST_THREADS); + + td = &thread_data[tid]; + + op = td->opcode; + dat = td->opdata; + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + break; + + if (td->mutexes[dat] != 1) + break; + + td->mutexes[dat] = 2; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKBKL: + default: + break; + } + + schedule(); + + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 3; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 1; + td->event = atomic_add_return(1, &rttest_event); + return; + + case RTTEST_LOCKBKL: + return; + default: + return; + } + + td->opcode = 0; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + int ret; + + set_current_state(TASK_RUNNING); + ret = handle_op(td, 1); + set_current_state(TASK_INTERRUPTIBLE); + if (td->opcode == RTTEST_LOCKCONT) + break; + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + } + + /* Restore previous command and data */ + td->opcode = op; + td->opdata = dat; +} + +static int test_func(void *data) +{ + struct test_thread_data *td = data; + int ret; + + current->flags |= PF_MUTEX_TESTER; + allow_signal(SIGHUP); + + for(;;) { + + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + set_current_state(TASK_RUNNING); + ret = handle_op(td, 0); + set_current_state(TASK_INTERRUPTIBLE); + 
td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + + if (signal_pending(current)) + flush_signals(current); + + if(kthread_should_stop()) + break; + } + return 0; +} + +/** + * sysfs_test_command - interface for test commands + * @dev: thread reference + * @buf: command for actual step + * @count: length of buffer + * + * command syntax: + * + * opcode:data + */ +static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, + size_t count) +{ + struct sched_param schedpar; + struct test_thread_data *td; + char cmdbuf[32]; + int op, dat, tid, ret; + + td = container_of(dev, struct test_thread_data, sysdev); + tid = td->sysdev.id; + + /* strings from sysfs write are not 0 terminated! */ + if (count >= sizeof(cmdbuf)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + memcpy(cmdbuf, buf, count); + cmdbuf[count] = 0; + + if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) + return -EINVAL; + + switch (op) { + case RTTEST_SCHEDOT: + schedpar.sched_priority = 0; + ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); + if (ret) + return ret; + set_user_nice(current, 0); + break; + + case RTTEST_SCHEDRT: + schedpar.sched_priority = dat; + ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); + if (ret) + return ret; + break; + + case RTTEST_SIGNAL: + send_sig(SIGHUP, threads[tid], 0); + break; + + default: + if (td->opcode > 0) + return -EBUSY; + td->opdata = dat; + td->opcode = op; + wake_up_process(threads[tid]); + } + + return count; +} + +/** + * sysfs_test_status - sysfs interface for rt tester + * @dev: thread to query + * @buf: char buffer to be filled with thread status info + */ +static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) +{ + struct test_thread_data *td; + char *curr = buf; + task_t *tsk; + int i; + + td = container_of(dev, struct test_thread_data, sysdev); + tsk = threads[td->sysdev.id]; + + 
spin_lock(&rttest_lock); + + curr += sprintf(curr, + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + td->opcode, td->event, tsk->state, + (MAX_RT_PRIO - 1) - tsk->prio, + (MAX_RT_PRIO - 1) - tsk->normal_prio, + tsk->pi_blocked_on, td->bkl); + + for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) + curr += sprintf(curr, "%d", td->mutexes[i]); + + spin_unlock(&rttest_lock); + + curr += sprintf(curr, ", T: %p, R: %p\n", tsk, + mutexes[td->sysdev.id].owner); + + return curr - buf; +} + +static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); +static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); + +static struct sysdev_class rttest_sysclass = { + set_kset_name("rttest"), +}; + +static int init_test_thread(int id) +{ + thread_data[id].sysdev.cls = &rttest_sysclass; + thread_data[id].sysdev.id = id; + + threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); + if (IS_ERR(threads[id])) + return PTR_ERR(threads[id]); + + return sysdev_register(&thread_data[id].sysdev); +} + +static int init_rttest(void) +{ + int ret, i; + + spin_lock_init(&rttest_lock); + + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) + rt_mutex_init(&mutexes[i]); + + ret = sysdev_class_register(&rttest_sysclass); + if (ret) + return ret; + + for (i = 0; i < MAX_RT_TEST_THREADS; i++) { + ret = init_test_thread(i); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); + if (ret) + break; + } + + printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); + + return ret; +} + +device_initcall(init_rttest); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 00000000000..45d61016da5 --- /dev/null +++ b/kernel/rtmutex.c @@ -0,0 +1,990 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. 
+ * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + */ +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/timer.h> + +#include "rtmutex_common.h" + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 + * are used to keep track of the "owner is pending" and "lock has + * waiters" state. + * + * owner bit1 bit0 + * NULL 0 0 lock is free (fast acquire possible) + * NULL 0 1 invalid state + * NULL 1 0 Transitional State* + * NULL 1 1 invalid state + * taskpointer 0 0 lock is held (fast release possible) + * taskpointer 0 1 task is pending owner + * taskpointer 1 0 lock is held and has waiters + * taskpointer 1 1 task is pending owner and lock has more waiters + * + * Pending ownership is assigned to the top (highest priority) + * waiter of the lock, when the lock is released. The thread is woken + * up and can now take the lock. Until the lock is taken (bit 0 + * cleared) a competing higher priority thread can steal the lock + * which puts the woken up thread back on the waiters list. + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 and 1 of lock->owner are 0. + * + * (*) There's a small time where the owner can be NULL and the + * "lock has waiters" bit is set. This can happen when grabbing the lock. + * To prevent a cmpxchg of the owner releasing the lock, we need to set this + * bit before looking at the lock, hence the reason this is a transitional + * state. 
+ */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, + unsigned long mask) +{ + unsigned long val = (unsigned long)owner | mask; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* + * Calculate task priority from the waiter list priority + * + * Return task->normal_prio when the waiter list is empty or when + * the waiter is not allowed to do priority boosting + */ +int rt_mutex_getprio(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return task->normal_prio; + + return min(task_top_pi_waiter(task)->pi_list_entry.prio, + task->normal_prio); +} + +/* + * Adjust the priority of a task, after its pi_waiters got modified. + * + * This can be both boosting and unboosting. task->pi_lock must be held. 
+ */ +static void __rt_mutex_adjust_prio(struct task_struct *task) +{ + int prio = rt_mutex_getprio(task); + + if (task->prio != prio) + rt_mutex_setprio(task, prio); +} + +/* + * Adjust task priority (undo boosting). Called from the exit path of + * rt_mutex_slowunlock() and rt_mutex_slowlock(). + * + * (Note: We do this outside of the protection of lock->wait_lock to + * allow the lock to be taken while or before we readjust the priority + * of task. We do not use the spin_xx_mutex() variants here as we are + * outside of the debug path.) + */ +static void rt_mutex_adjust_prio(struct task_struct *task) +{ + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); +} + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * Returns 0 or -EDEADLK. + */ +static int rt_mutex_adjust_prio_chain(task_t *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task + __IP_DECL__) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + unsigned long flags; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); + + /* + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. + */ + again: + if (++depth > max_lock_depth) { + static int prev_max; + + /* + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. 
+ */ + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + top_task->comm, top_task->pid); + } + put_task_struct(task); + + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter || !waiter->task) + goto out_unlock_pi; + + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; + + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) + goto out_unlock_pi; + + lock = waiter->lock; + if (!spin_trylock(&lock->wait_lock)) { + spin_unlock_irqrestore(&task->pi_lock, flags); + cpu_relax(); + goto retry; + } + + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? 
-EDEADLK : 0; + goto out_unlock_pi; + } + + top_waiter = rt_mutex_top_waiter(lock); + + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ + spin_unlock_irqrestore(&task->pi_lock, flags); + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + spin_lock_irqsave(&task->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } + + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + top_waiter = rt_mutex_top_waiter(lock); + spin_unlock(&lock->wait_lock); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; + + goto again; + + out_unlock_pi: + spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: + put_task_struct(task); + return ret; +} + +/* + * Optimization: check if we can steal the lock from the + * assigned pending owner [which might not have taken the + * lock yet]: + */ +static inline int try_to_steal_lock(struct rt_mutex *lock) +{ + struct task_struct *pendowner = rt_mutex_owner(lock); + struct rt_mutex_waiter *next; + unsigned long flags; + + if (!rt_mutex_owner_pending(lock)) + return 0; + + if (pendowner == current) + return 1; + + spin_lock_irqsave(&pendowner->pi_lock, flags); + if (current->prio >= pendowner->prio) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 0; + } + + /* + * Check 
if a waiter is enqueued on the pending owners + * pi_waiters list. Remove it and readjust pending owners + * priority. + */ + if (likely(!rt_mutex_has_waiters(lock))) { + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + return 1; + } + + /* No chain handling, pending owner is not blocked on anything: */ + next = rt_mutex_top_waiter(lock); + plist_del(&next->pi_list_entry, &pendowner->pi_waiters); + __rt_mutex_adjust_prio(pendowner); + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + /* + * We are going to steal the lock and a waiter was + * enqueued on the pending owners pi_waiters queue. So + * we have to enqueue this waiter into + * current->pi_waiters list. This covers the case, + * where current is boosted because it holds another + * lock and gets unboosted because the booster is + * interrupted, so we would delay a waiter with higher + * priority as current->normal_prio. + * + * Note: in the rare case of a SCHED_OTHER task changing + * its priority and thus stealing the lock, next->task + * might be current: + */ + if (likely(next->task != current)) { + spin_lock_irqsave(¤t->pi_lock, flags); + plist_add(&next->pi_list_entry, ¤t->pi_waiters); + __rt_mutex_adjust_prio(current); + spin_unlock_irqrestore(¤t->pi_lock, flags); + } + return 1; +} + +/* + * Try to take an rt-mutex + * + * This fails + * - when the lock has a real owner + * - when a different pending owner exists and has higher priority than current + * + * Must be called with lock->wait_lock held. + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) +{ + /* + * We have to be careful here if the atomic speedups are + * enabled, such that, when + * - no other waiter is on the lock + * - the lock has been released since we did the cmpxchg + * the lock can be released or taken while we are doing the + * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * + * The atomic acquire/release aware variant of + * mark_rt_mutex_waiters uses a cmpxchg loop. 
After setting + * the WAITERS bit, the atomic release / acquire can not + * happen anymore and lock->wait_lock protects us from the + * non-atomic case. + * + * Note, that this might set lock->owner = + * RT_MUTEX_HAS_WAITERS in the case the lock is not contended + * any more. This is fixed up when we take the ownership. + * This is the transitional state explained at the top of this file. + */ + mark_rt_mutex_waiters(lock); + + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + return 0; + + /* We got the lock. */ + debug_rt_mutex_lock(lock __IP__); + + rt_mutex_set_owner(lock, current, 0); + + rt_mutex_deadlock_account_lock(lock, current); + + return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. + */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + int detect_deadlock + __IP_DECL__) +{ + struct rt_mutex_waiter *top_waiter = waiter; + task_t *owner = rt_mutex_owner(lock); + int boost = 0, res; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + __rt_mutex_adjust_prio(current); + waiter->task = current; + waiter->lock = lock; + plist_node_init(&waiter->list_entry, current->prio); + plist_node_init(&waiter->pi_list_entry, current->prio); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + current->pi_blocked_on = waiter; + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) { + boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { + spin_lock_irqsave(&owner->pi_lock, flags); + if (owner->pi_blocked_on) { + boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + if (!boost) + return 0; + + spin_unlock(&lock->wait_lock); + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, + current __IP__); + + spin_lock(&lock->wait_lock); + + return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks waiter list and from + * the lock waiter list. Set it as pending owner. Then wake it up. + * + * Called with lock->wait_lock held. + */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *waiter; + struct task_struct *pendowner; + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + + waiter = rt_mutex_top_waiter(lock); + plist_del(&waiter->list_entry, &lock->wait_list); + + /* + * Remove it from current->pi_waiters. We do not adjust a + * possible priority boost right now. We execute wakeup in the + * boosted mode and go back to normal after releasing + * lock->wait_lock. + */ + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + pendowner = waiter->task; + waiter->task = NULL; + + rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + + spin_unlock_irqrestore(¤t->pi_lock, flags); + + /* + * Clear the pi_blocked_on variable and enqueue a possible + * waiter into the pi_waiters list of the pending owner. This + * prevents that in case the pending owner gets unboosted a + * waiter with higher priority than pending-owner->normal_prio + * is blocked on the unboosted (pending) owner. 
+ */ + spin_lock_irqsave(&pendowner->pi_lock, flags); + + WARN_ON(!pendowner->pi_blocked_on); + WARN_ON(pendowner->pi_blocked_on != waiter); + WARN_ON(pendowner->pi_blocked_on->lock != lock); + + pendowner->pi_blocked_on = NULL; + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &pendowner->pi_waiters); + } + spin_unlock_irqrestore(&pendowner->pi_lock, flags); + + wake_up_process(pendowner); +} + +/* + * Remove a waiter from a lock + * + * Must be called with lock->wait_lock held + */ +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter __IP_DECL__) +{ + int first = (waiter == rt_mutex_top_waiter(lock)); + int boost = 0; + task_t *owner = rt_mutex_owner(lock); + unsigned long flags; + + spin_lock_irqsave(¤t->pi_lock, flags); + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->task = NULL; + current->pi_blocked_on = NULL; + spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (first && owner != current) { + + spin_lock_irqsave(&owner->pi_lock, flags); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); + + if (owner->pi_blocked_on) { + boost = 1; + /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ + get_task_struct(owner); + } + spin_unlock_irqrestore(&owner->pi_lock, flags); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + + if (!boost) + return; + + spin_unlock(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); + + spin_lock(&lock->wait_lock); +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + unsigned long flags; + + spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || waiter->list_entry.prio == task->prio) { + spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + spin_unlock_irqrestore(&task->pi_lock, flags); + + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + waiter.task = NULL; + + spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock __IP__)) { + spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) + hrtimer_start(&timeout->timer, timeout->timer.expires, + HRTIMER_ABS); + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock __IP__)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? 
*/ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter.task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter.task) { + ret = task_blocks_on_rt_mutex(lock, &waiter, + detect_deadlock __IP__); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter.task)) + continue; + + if (unlikely(ret)) + break; + } + + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(&waiter); + + if (waiter.task) + schedule_rt_mutex(lock); + + spin_lock(&lock->wait_lock); + set_current_state(state); + } + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter.task)) + remove_waiter(lock, &waiter __IP__); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + /* + * Readjust priority, when we did not get the lock. We might + * have been the pending owner and boosted. Since we did not + * take the lock, the PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) +{ + int ret = 0; + + spin_lock(&lock->wait_lock); + + if (likely(rt_mutex_owner(lock) != current)) { + + ret = try_to_take_rt_mutex(lock __IP__); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. 
+ */ + fixup_rt_mutex_waiters(lock); + } + + spin_unlock(&lock->wait_lock); + + return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ + spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + spin_unlock(&lock->wait_lock); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, + int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock __IP_DECL__)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) +{ + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 1; + } + return slowfn(lock __RET_IP__); +} + +static inline void 
+rt_mutex_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + might_sleep(); + + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller + * + * @lock: the rt_mutex to be locked + * @timeout: timeout structure or NULL (no timeout) + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -ETIMEOUT when the timeout expired + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * Returns 1 on success and 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + 
return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/** + * rt_mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void rt_mutex_destroy(struct rt_mutex *lock) +{ + WARN_ON(rt_mutex_is_locked(lock)); +#ifdef CONFIG_DEBUG_RT_MUTEXES + lock->magic = NULL; +#endif +} + +EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +/** + * __rt_mutex_init - initialize the rt lock + * + * @lock: the rt lock to be initialized + * + * Initialize the rt lock to unlocked state. + * + * Initializing of a locked rt lock is not allowed + */ +void __rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + lock->owner = NULL; + spin_lock_init(&lock->wait_lock); + plist_head_init(&lock->wait_list, &lock->wait_lock); + + debug_rt_mutex_init(lock, name); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + __rt_mutex_init(lock, NULL); + debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); + rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_deadlock_account_lock(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be unlocked + * + * No locking. 
Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_deadlock_account_unlock(proxy_owner); +} + +/** + * rt_mutex_next_owner - return the next owner of the lock + * + * @lock: the rt lock query + * + * Returns the next owner of the lock or NULL + * + * Caller has to serialize against other accessors to the lock + * itself. + * + * Special API call for PI-futex support + */ +struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + return NULL; + + return rt_mutex_top_waiter(lock)->task; +} diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 00000000000..1e0fca13ff7 --- /dev/null +++ b/kernel/rtmutex.h @@ -0,0 +1,29 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains macros used solely by rtmutex.c. + * Non-debug version. 
+ */ + +#define __IP_DECL__ +#define __IP__ +#define __RET_IP__ +#define rt_mutex_deadlock_check(l) (0) +#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) +#define rt_mutex_deadlock_account_unlock(l) do { } while (0) +#define debug_rt_mutex_init_waiter(w) do { } while (0) +#define debug_rt_mutex_free_waiter(w) do { } while (0) +#define debug_rt_mutex_lock(l) do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) +#define debug_rt_mutex_proxy_unlock(l) do { } while (0) +#define debug_rt_mutex_unlock(l) do { } while (0) +#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +#define debug_rt_mutex_print_deadlock(w) do { } while (0) +#define debug_rt_mutex_detect_deadlock(w,d) (d) +#define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 00000000000..9c75856e791 --- /dev/null +++ b/kernel/rtmutex_common.h @@ -0,0 +1,123 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include <linux/rtmutex.h> + +/* + * The rtmutex in kernel tester is independent of rtmutex debugging. We + * call schedule_rt_mutex_test() instead of schedule() for the tasks which + * belong to the tester. That way we can delay the wakeup path of those + * threads to provoke lock stealing and testing of complex boosting scenarios. 
+ */ +#ifdef CONFIG_RT_MUTEX_TESTER + +extern void schedule_rt_mutex_test(struct rt_mutex *lock); + +#define schedule_rt_mutex(_lock) \ + do { \ + if (!(current->flags & PF_MUTEX_TESTER)) \ + schedule(); \ + else \ + schedule_rt_mutex_test(_lock); \ + } while (0) + +#else +# define schedule_rt_mutex(_lock) schedule() +#endif + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack of the blocked task. + * + * @list_entry: pi node to enqueue into the mutex waiters list + * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @task: task reference to the blocked task + */ +struct rt_mutex_waiter { + struct plist_node list_entry; + struct plist_node pi_list_entry; + struct task_struct *task; + struct rt_mutex *lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES + unsigned long ip; + pid_t deadlock_task_pid; + struct rt_mutex *deadlock_lock; +#endif +}; + +/* + * Various helpers to access the waiters-plist: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return !plist_head_empty(&lock->wait_list); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *w; + + w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, + list_entry); + BUG_ON(w->lock != lock); + + return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return !plist_head_empty(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, + pi_list_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_OWNER_PENDING 1UL +#define RT_MUTEX_HAS_WAITERS 2UL +#define RT_MUTEX_OWNER_MASKALL 3UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +} + +static inline struct task_struct 
*rt_mutex_real_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) +{ + return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; +} + +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); +#endif diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7..2629c1711fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -168,15 +168,21 @@ */ #define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) -static unsigned int task_timeslice(task_t *p) +static unsigned int static_prio_timeslice(int static_prio) { - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); + return SCALE_PRIO(DEF_TIMESLICE, static_prio); } + +static inline unsigned int task_timeslice(task_t *p) +{ + return static_prio_timeslice(p->static_prio); +} + #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) @@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p) * These are the runqueue data structures: */ -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - typedef struct runqueue runqueue_t; struct prio_array { unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; + DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ struct list_head queue[MAX_PRIO]; }; @@ -209,6 +213,7 @@ struct runqueue { 
* remote CPUs use both these fields when doing load calculation. */ unsigned long nr_running; + unsigned long raw_weighted_load; #ifdef CONFIG_SMP unsigned long cpu_load[3]; #endif @@ -239,7 +244,6 @@ struct runqueue { task_t *migration_thread; struct list_head migration_queue; - int cpu; #endif #ifdef CONFIG_SCHEDSTATS @@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline runqueue_t *__task_rq_lock(task_t *p) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock(&rq->lock); + goto repeat_lock_task; + } + return rq; +} + +/* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) __acquires(rq->lock) { struct runqueue *rq; @@ -371,6 +394,12 @@ repeat_lock_task: return rq; } +static inline void __task_rq_unlock(runqueue_t *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) __releases(rq->lock) { @@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) } /* - * effective_prio - return the priority that is based on the static + * __normal_prio - return the priority that is based on the static * priority but is modified by bonuses/penalties. * * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] @@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) * * Both properties are important to certain workloads. 
*/ -static int effective_prio(task_t *p) + +static inline int __normal_prio(task_t *p) { int bonus, prio; - if (rt_task(p)) - return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; @@ -665,6 +692,106 @@ static int effective_prio(task_t *p) } /* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +/* + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE + * If static_prio_timeslice() is ever changed to break this assumption then + * this code will need modification + */ +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +#define LOAD_WEIGHT(lp) \ + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +#define PRIO_TO_LOAD_WEIGHT(prio) \ + LOAD_WEIGHT(static_prio_timeslice(prio)) +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) + +static void set_load_weight(task_t *p) +{ + if (has_rt_policy(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. 
+ */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); +} + +static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) +{ + rq->raw_weighted_load += p->load_weight; +} + +static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) +{ + rq->raw_weighted_load -= p->load_weight; +} + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; + inc_raw_weighted_load(rq, p); +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; + dec_raw_weighted_load(rq, p); +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(task_t *p) +{ + int prio; + + if (has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(task_t *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/* * __activate_task - move a task to the runqueue. 
*/ static void __activate_task(task_t *p, runqueue_t *rq) @@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) if (batch_task(p)) target = rq->expired; enqueue_task(p, target); - rq->nr_running++; + inc_nr_running(p, rq); } /* @@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); - rq->nr_running++; + inc_nr_running(p, rq); } +/* + * Recalculate p->normal_prio and p->prio after having slept, + * updating the sleep-average too: + */ static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; + unsigned long sleep_time = now - p->timestamp; if (batch_task(p)) sleep_time = 0; - else { - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - } if (likely(sleep_time > 0)) { /* - * User tasks that sleep a long time are categorised as - * idle. They will only have their sleep_avg increased to a - * level that makes them just interactive priority to stay - * active yet prevent them suddenly becoming cpu hogs and - * starving other processes. + * This ceiling is set to the lowest priority that would allow + * a task to be reinserted into the active array on timeslice + * completion. */ - if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { - unsigned long ceiling; + unsigned long ceiling = INTERACTIVE_SLEEP(p); - ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - if (p->sleep_avg < ceiling) - p->sleep_avg = ceiling; + if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { + /* + * Prevents user tasks from achieving best priority + * with one single large enough sleep. 
+ */ + p->sleep_avg = ceiling; + /* + * Using INTERACTIVE_SLEEP() as a ceiling places a + * nice(0) task 1ms sleep away from promotion, and + * gives it 700ms to round-robin with no chance of + * being demoted. This is more than generous, so + * mark this sleep as non-interactive to prevent the + * on-runqueue bonus logic from intervening should + * this task not receive cpu immediately. + */ + p->sleep_type = SLEEP_NONINTERACTIVE; } else { /* * Tasks waking from uninterruptible sleep are @@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) * are likely to be waiting on I/O */ if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) + if (p->sleep_avg >= ceiling) sleep_time = 0; else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; + ceiling) { + p->sleep_avg = ceiling; + sleep_time = 0; } } @@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) */ p->sleep_avg += sleep_time; - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; } + if (p->sleep_avg > NS_MAX_SLEEP_AVG) + p->sleep_avg = NS_MAX_SLEEP_AVG; } return effective_prio(p); @@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - rq->nr_running--; + dec_nr_running(p, rq); dequeue_task(p, p->array); p->array = NULL; } @@ -818,6 +951,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) * the target CPU. 
*/ #ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + static void resched_task(task_t *p) { int cpu; @@ -833,9 +971,9 @@ static void resched_task(task_t *p) if (cpu == smp_processor_id()) return; - /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + /* NEED_RESCHED must be visible before we test polling */ smp_mb(); - if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } #else @@ -855,6 +993,12 @@ inline int task_curr(const task_t *p) return cpu_curr(task_cpu(p)) == p; } +/* Used instead of source_load when we know the type == 0 */ +unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->raw_weighted_load; +} + #ifdef CONFIG_SMP typedef struct { struct list_head list; @@ -944,7 +1088,8 @@ void kick_process(task_t *p) } /* - * Return a low guess at the load of a migration-source cpu. + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. * * We want to under-estimate the load of migration sources, to * balance conservatively. @@ -952,24 +1097,36 @@ void kick_process(task_t *p) static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + return rq->raw_weighted_load; - return min(rq->cpu_load[type-1], load_now); + return min(rq->cpu_load[type-1], rq->raw_weighted_load); } /* - * Return a high guess at the load of a migration-target cpu + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. 
*/ static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) - return load_now; + return rq->raw_weighted_load; - return max(rq->cpu_load[type-1], load_now); + return max(rq->cpu_load[type-1], rq->raw_weighted_load); +} + +/* + * Return the average load per task on the cpu's run queue + */ +static inline unsigned long cpu_avg_load_per_task(int cpu) +{ + runqueue_t *rq = cpu_rq(cpu); + unsigned long n = rq->nr_running; + + return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; } /* @@ -1042,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) cpus_and(tmp, group->cpumask, p->cpus_allowed); for_each_cpu_mask(i, tmp) { - load = source_load(i, 0); + load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { min_load = load; @@ -1069,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag) struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; - for_each_domain(cpu, tmp) + for_each_domain(cpu, tmp) { + /* + * If power savings logic is enabled for a domain, stop there. 
+ */ + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + break; if (tmp->flags & flag) sd = tmp; + } while (sd) { cpumask_t span; @@ -1221,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; + unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load * of the current CPU: */ if (sync) - tl -= SCHED_LOAD_SCALE; + tl -= current->load_weight; if ((tl <= load && - tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || - 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + tl + target_load(cpu, idx) <= tl_per_task) || + 100*(tl + p->load_weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1348,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; + + /* + * Make sure we do not leak PI boosting priority to the child: + */ + p->prio = current->normal_prio; + INIT_LIST_HEAD(&p->run_list); p->array = NULL; #ifdef CONFIG_SCHEDSTATS @@ -1427,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) __activate_task(p, rq); else { p->prio = current->prio; + p->normal_prio = current->normal_prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - rq->nr_running++; + inc_nr_running(p, rq); } set_need_resched(); } else @@ -1648,7 +1820,8 @@ unsigned long nr_uninterruptible(void) unsigned long long nr_context_switches(void) { - unsigned long long i, sum = 0; + int i; + unsigned long long sum = 0; for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches; @@ -1686,9 +1859,6 @@ unsigned long nr_active(void) /* * double_rq_lock - safely lock two runqueues * - * We must take them in cpu order to match code in - * dependent_sleeper and wake_dependent_sleeper. 
- * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ @@ -1700,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) spin_lock(&rq1->lock); __acquire(rq2->lock); /* Fake it out ;) */ } else { - if (rq1->cpu < rq2->cpu) { + if (rq1 < rq2) { spin_lock(&rq1->lock); spin_lock(&rq2->lock); } else { @@ -1736,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) __acquires(this_rq->lock) { if (unlikely(!spin_trylock(&busiest->lock))) { - if (busiest->cpu < this_rq->cpu) { + if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); @@ -1799,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - src_rq->nr_running--; + dec_nr_running(p, src_rq); set_task_cpu(p, this_cpu); - this_rq->nr_running++; + inc_nr_running(p, this_rq); enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -1848,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; } +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) /* - * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, - * as part of a balancing operation within "domain". Returns the number of - * tasks moved. + * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted + * load from busiest to this_rq, as part of a balancing operation within + * "domain". Returns the number of tasks moved. * * Called with both runqueues locked. 
*/ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle, int *all_pinned) + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0, pinned = 0; + int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; + int busiest_best_prio_seen; + int skip_for_load; /* skip the task based on weighted load issues */ + long rem_load_move; task_t *tmp; - if (max_nr_move == 0) + if (max_nr_move == 0 || max_load_move == 0) goto out; + rem_load_move = max_load_move; pinned = 1; + this_best_prio = rq_best_prio(this_rq); + busiest_best_prio = rq_best_prio(busiest); + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==busiest_best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) of + * any task we find with that prio. + */ + busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; /* * We first consider expired tasks. Those will likely not be @@ -1907,7 +2093,17 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + /* + * To help distribute high priority tasks across CPUs we don't + * skip a task if it will be the highest priority task (i.e. 
smallest + * prio value) on its new queue regardless of its load weight + */ + skip_for_load = tmp->load_weight > rem_load_move; + if (skip_for_load && idx < this_best_prio) + skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio; + if (skip_for_load || + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + busiest_best_prio_seen |= idx == busiest_best_prio; if (curr != head) goto skip_queue; idx++; @@ -1921,9 +2117,15 @@ skip_queue: pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; + rem_load_move -= tmp->load_weight; - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { + /* + * We only want to steal up to the prescribed number of tasks + * and the prescribed amount of weighted load. + */ + if (pulled < max_nr_move && rem_load_move > 0) { + if (idx < this_best_prio) + this_best_prio = idx; if (curr != head) goto skip_queue; idx++; @@ -1944,7 +2146,7 @@ out: /* * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be + * domain. It calculates and returns the amount of weighted load which should be * moved to restore balance via the imbalance parameter. 
*/ static struct sched_group * @@ -1954,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; + unsigned long busiest_load_per_task, busiest_nr_running; + unsigned long this_load_per_task, this_nr_running; int load_idx; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance = 1; + unsigned long leader_nr_running = 0, min_load_per_task = 0; + unsigned long min_nr_running = ULONG_MAX; + struct sched_group *group_min = NULL, *group_leader = NULL; +#endif max_load = this_load = total_load = total_pwr = 0; + busiest_load_per_task = busiest_nr_running = 0; + this_load_per_task = this_nr_running = 0; if (idle == NOT_IDLE) load_idx = sd->busy_idx; else if (idle == NEWLY_IDLE) @@ -1965,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load_idx = sd->idle_idx; do { - unsigned long load; + unsigned long load, group_capacity; int local_group; int i; + unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ - avg_load = 0; + sum_weighted_load = sum_nr_running = avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + runqueue_t *rq = cpu_rq(i); + if (*sd_idle && !idle_cpu(i)) *sd_idle = 0; @@ -1985,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load = source_load(i, load_idx); avg_load += load; + sum_nr_running += rq->nr_running; + sum_weighted_load += rq->raw_weighted_load; } total_load += avg_load; @@ -1993,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Adjust by relative CPU power of the group */ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + if (local_group) { this_load = avg_load; this = group; - } else if (avg_load > max_load) { 
+ this_nr_running = sum_nr_running; + this_load_per_task = sum_weighted_load; + } else if (avg_load > max_load && + sum_nr_running > group_capacity) { max_load = avg_load; busiest = group; + busiest_nr_running = sum_nr_running; + busiest_load_per_task = sum_weighted_load; } + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto group_next; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (this_nr_running >= group_capacity || + !this_nr_running)) + power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!power_savings_balance || sum_nr_running >= group_capacity + || !sum_nr_running) + goto group_next; + + /* + * Calculate the group which has the least non-idle load. 
+ * This is the group from where we need to pick up the load + * for saving power + */ + if ((sum_nr_running < min_nr_running) || + (sum_nr_running == min_nr_running && + first_cpu(group->cpumask) < + first_cpu(group_min->cpumask))) { + group_min = group; + min_nr_running = sum_nr_running; + min_load_per_task = sum_weighted_load / + sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sum_nr_running <= group_capacity - 1) + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + first_cpu(group->cpumask) > + first_cpu(group_leader->cpumask))) { + group_leader = group; + leader_nr_running = sum_nr_running; + } + +group_next: +#endif group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) + if (!busiest || this_load >= max_load || busiest_nr_running == 0) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -2012,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; + busiest_load_per_task /= busiest_nr_running; /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to @@ -2023,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ + if (max_load <= busiest_load_per_task) + goto out_balanced; + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) 
+ */ + if (max_load < avg_load) { + *imbalance = 0; + goto small_imbalance; + } /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * busiest->cpu_power, (avg_load - this_load) * this->cpu_power) / SCHED_LOAD_SCALE; - if (*imbalance < SCHED_LOAD_SCALE) { - unsigned long pwr_now = 0, pwr_move = 0; + /* + * if *imbalance is less than the average load per runnable task + * there is no guarantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < busiest_load_per_task) { + unsigned long pwr_now, pwr_move; unsigned long tmp; + unsigned int imbn; + +small_imbalance: + pwr_move = pwr_now = 0; + imbn = 2; + if (this_nr_running) { + this_load_per_task /= this_nr_running; + if (busiest_load_per_task > this_load_per_task) + imbn = 1; + } else + this_load_per_task = SCHED_LOAD_SCALE; - if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; + if (max_load - this_load >= busiest_load_per_task * imbn) { + *imbalance = busiest_load_per_task; return busiest; } @@ -2047,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * moving them. 
*/ - pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); - pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); + pwr_now += busiest->cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; if (max_load > tmp) - pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, - max_load - tmp); + pwr_move += busiest->cpu_power * + min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ if (max_load*busiest->cpu_power < - SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) + busiest_load_per_task*SCHED_LOAD_SCALE) tmp = max_load*busiest->cpu_power/this->cpu_power; else - tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); + tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; + pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ if (pwr_move <= pwr_now) goto out_balanced; - *imbalance = 1; - return busiest; + *imbalance = busiest_load_per_task; } - /* Get rid of the scaling factor, rounding down as we divide */ - *imbalance = *imbalance / SCHED_LOAD_SCALE; return busiest; out_balanced: +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + goto ret; + if (this == group_leader && group_leader != group_min) { + *imbalance = min_load_per_task; + return group_min; + } +ret: +#endif *imbalance = 0; return NULL; } @@ -2088,18 +2406,21 @@ out_balanced: * find_busiest_queue - find the busiest runqueue among the cpus in group. 
*/ static runqueue_t *find_busiest_queue(struct sched_group *group, - enum idle_type idle) + enum idle_type idle, unsigned long imbalance) { - unsigned long load, max_load = 0; - runqueue_t *busiest = NULL; + unsigned long max_load = 0; + runqueue_t *busiest = NULL, *rqi; int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i, 0); + rqi = cpu_rq(i); + + if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance) + continue; - if (load > max_load) { - max_load = load; - busiest = cpu_rq(i); + if (rqi->raw_weighted_load > max_load) { + max_load = rqi->raw_weighted_load; + busiest = rqi; } } @@ -2112,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, */ #define MAX_PINNED_INTERVAL 512 +#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2128,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, int active_balance = 0; int sd_idle = 0; - if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); @@ -2139,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group, idle); + busiest = find_busiest_queue(group, idle, imbalance); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2159,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); @@ -2216,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 
+ !sched_smt_power_savings) return -1; return nr_moved; @@ -2231,7 +2556,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; return 0; } @@ -2252,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, int nr_moved = 0; int sd_idle = 0; - if (sd->flags & SD_SHARE_CPUPOWER) + if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); @@ -2262,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE); + busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2277,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); } @@ -2292,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) return -1; sd->nr_balance_failed = 0; return 0; @@ -2347,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. 
*/ - for_each_domain(target_cpu, sd) + for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpu_isset(busiest_cpu, sd->span)) break; + } if (unlikely(sd == NULL)) goto out; schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, + RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -2385,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd; int i; - this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + this_load = this_rq->raw_weighted_load; /* Update our load */ for (i = 0; i < 3; i++) { unsigned long new_load = this_load; @@ -2686,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) resched_task(rq->idle); } -static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +/* + * Called with interrupt disabled and this_rq's runqueue locked. + */ +static void wake_sleeping_dependent(int this_cpu) { struct sched_domain *tmp, *sd = NULL; - cpumask_t sibling_map; int i; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) + for_each_domain(this_cpu, tmp) { + if (tmp->flags & SD_SHARE_CPUPOWER) { sd = tmp; + break; + } + } if (!sd) return; - /* - * Unlock the current runqueue because we have to lock in - * CPU order to avoid deadlocks. Caller knows that we might - * unlock. We keep IRQs disabled. - */ - spin_unlock(&this_rq->lock); - - sibling_map = sd->span; - - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - /* - * We clear this CPU from the mask. 
This both simplifies the - * inner loop and keps this_rq locked when we exit: - */ - cpu_clear(this_cpu, sibling_map); - - for_each_cpu_mask(i, sibling_map) { + for_each_cpu_mask(i, sd->span) { runqueue_t *smt_rq = cpu_rq(i); + if (i == this_cpu) + continue; + if (unlikely(!spin_trylock(&smt_rq->lock))) + continue; + wakeup_busy_runqueue(smt_rq); + spin_unlock(&smt_rq->lock); } - - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); - /* - * We exit with this_cpu's rq still held and IRQs - * still disabled: - */ } /* @@ -2740,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) return p->time_slice * (100 - sd->per_cpu_gain) / 100; } -static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +/* + * To minimise lock contention and not have to drop this_rq's runlock we only + * trylock the sibling runqueues and bypass those runqueues if we fail to + * acquire their lock. As we only trylock the normal locking order does not + * need to be obeyed. 
+ */ +static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) { struct sched_domain *tmp, *sd = NULL; - cpumask_t sibling_map; - prio_array_t *array; int ret = 0, i; - task_t *p; - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_SHARE_CPUPOWER) + /* kernel/rt threads do not participate in dependent sleeping */ + if (!p->mm || rt_task(p)) + return 0; + + for_each_domain(this_cpu, tmp) { + if (tmp->flags & SD_SHARE_CPUPOWER) { sd = tmp; + break; + } + } if (!sd) return 0; - /* - * The same locking rules and details apply as for - * wake_sleeping_dependent(): - */ - spin_unlock(&this_rq->lock); - sibling_map = sd->span; - for_each_cpu_mask(i, sibling_map) - spin_lock(&cpu_rq(i)->lock); - cpu_clear(this_cpu, sibling_map); + for_each_cpu_mask(i, sd->span) { + runqueue_t *smt_rq; + task_t *smt_curr; - /* - * Establish next task to be run - it might have gone away because - * we released the runqueue lock above: - */ - if (!this_rq->nr_running) - goto out_unlock; - array = this_rq->active; - if (!array->nr_active) - array = this_rq->expired; - BUG_ON(!array->nr_active); + if (i == this_cpu) + continue; - p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, - task_t, run_list); + smt_rq = cpu_rq(i); + if (unlikely(!spin_trylock(&smt_rq->lock))) + continue; - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq = cpu_rq(i); - task_t *smt_curr = smt_rq->curr; + smt_curr = smt_rq->curr; - /* Kernel threads do not participate in dependent sleeping */ - if (!p->mm || !smt_curr->mm || rt_task(p)) - goto check_smt_task; + if (!smt_curr->mm) + goto unlock; /* * If a user task with lower static priority than the @@ -2803,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) if ((jiffies % DEF_TIMESLICE) > (sd->per_cpu_gain * DEF_TIMESLICE / 100)) ret = 1; - } else + } else { if (smt_curr->static_prio < p->static_prio && !TASK_PREEMPTS_CURR(p, smt_rq) && smt_slice(smt_curr, sd) > task_timeslice(p)) ret = 1; - 
-check_smt_task: - if ((!smt_curr->mm && smt_curr != smt_rq->idle) || - rt_task(smt_curr)) - continue; - if (!p->mm) { - wakeup_busy_runqueue(smt_rq); - continue; - } - - /* - * Reschedule a lower priority task on the SMT sibling for - * it to be put to sleep, or wake it up if it has been put to - * sleep for priority reasons to see if it should run now. - */ - if (rt_task(p)) { - if ((jiffies % DEF_TIMESLICE) > - (sd->per_cpu_gain * DEF_TIMESLICE / 100)) - resched_task(smt_curr); - } else { - if (TASK_PREEMPTS_CURR(p, smt_rq) && - smt_slice(p, sd) > task_timeslice(smt_curr)) - resched_task(smt_curr); - else - wakeup_busy_runqueue(smt_rq); } +unlock: + spin_unlock(&smt_rq->lock); } -out_unlock: - for_each_cpu_mask(i, sibling_map) - spin_unlock(&cpu_rq(i)->lock); return ret; } #else -static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) +static inline void wake_sleeping_dependent(int this_cpu) { } -static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, + task_t *p) { return 0; } @@ -2967,32 +3251,13 @@ need_resched_nonpreemptible: cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { -go_idle: idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); - /* - * wake_sleeping_dependent() might have released - * the runqueue, so break out if we got new - * tasks meanwhile: - */ - if (!rq->nr_running) - goto switch_tasks; - } - } else { - if (dependent_sleeper(cpu, rq)) { - next = rq->idle; + wake_sleeping_dependent(cpu); goto switch_tasks; } - /* - * dependent_sleeper() releases and reacquires the runqueue - * lock, hence go into the idle loop if the rq went - * empty meanwhile: - */ - if (unlikely(!rq->nr_running)) - goto go_idle; } array = rq->active; @@ -3030,6 +3295,8 @@ go_idle: } } next->sleep_type = SLEEP_NORMAL; + if (dependent_sleeper(cpu, rq, next)) + next = rq->idle; 
switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); @@ -3473,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(sleep_on_timeout); +#ifdef CONFIG_RT_MUTEXES + +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +void rt_mutex_setprio(task_t *p, int prio) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); +} + +#endif + void set_user_nice(task_t *p, long nice) { unsigned long flags; prio_array_t *array; runqueue_t *rq; - int old_prio, new_prio, delta; + int old_prio, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3493,22 +3813,25 @@ void set_user_nice(task_t *p, long nice) * it wont have any effect on scheduling until the task is * not SCHED_NORMAL/SCHED_BATCH: */ - if (rt_task(p)) { + if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } array = p->array; - if (array) + if (array) { dequeue_task(p, array); + dec_raw_weighted_load(rq, p); 
+ } - old_prio = p->prio; - new_prio = NICE_TO_PRIO(nice); - delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); - p->prio += delta; + set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; if (array) { enqueue_task(p, array); + inc_raw_weighted_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3519,7 +3842,6 @@ void set_user_nice(task_t *p, long nice) out_unlock: task_rq_unlock(rq, &flags); } - EXPORT_SYMBOL(set_user_nice); /* @@ -3634,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) BUG_ON(p->array); p->policy = policy; p->rt_priority = prio; - if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { - p->prio = MAX_RT_PRIO-1 - p->rt_priority; - } else { - p->prio = p->static_prio; - /* - * SCHED_BATCH tasks are treated as perpetual CPU hogs: - */ - if (policy == SCHED_BATCH) - p->sleep_avg = 0; - } + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + /* + * SCHED_BATCH tasks are treated as perpetual CPU hogs: + */ + if (policy == SCHED_BATCH) + p->sleep_avg = 0; + set_load_weight(p); } /** @@ -3662,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; runqueue_t *rq; + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ if (policy < 0) @@ -3710,14 +4033,20 @@ recheck: if (retval) return retval; /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + */ + spin_lock_irqsave(&p->pi_lock, flags); + /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. 
*/ - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } array = p->array; @@ -3738,7 +4067,11 @@ recheck: } else if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); + + rt_mutex_adjust_pi(p); + return 0; } EXPORT_SYMBOL_GPL(sched_setscheduler); @@ -3760,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) read_unlock_irq(&tasklist_lock); return -ESRCH; } - retval = sched_setscheduler(p, policy, &lparam); + get_task_struct(p); read_unlock_irq(&tasklist_lock); + retval = sched_setscheduler(p, policy, &lparam); + put_task_struct(p); return retval; } @@ -3886,6 +4221,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) !capable(CAP_SYS_NICE)) goto out_unlock; + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); retval = set_cpus_allowed(p, new_mask); @@ -3954,7 +4293,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (!p) goto out_unlock; - retval = 0; + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: @@ -4046,6 +4388,9 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif /* * The BKS might be reacquired before we have dropped * PREEMPT_ACTIVE, which could trigger a second @@ -4142,7 +4487,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = 
&__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4153,7 +4498,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); @@ -4237,7 +4582,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? + jiffies_to_timespec(p->policy == SCHED_FIFO ? 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -4363,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu) idle->timestamp = sched_clock(); idle->sleep_avg = 0; idle->array = NULL; - idle->prio = MAX_PRIO; + idle->prio = idle->normal_prio = MAX_PRIO; idle->state = TASK_RUNNING; idle->cpus_allowed = cpumask_of_cpu(cpu); set_task_cpu(idle, cpu); @@ -4459,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); * * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. + * + * Returns non-zero if task was successfully migrated. */ -static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { runqueue_t *rq_dest, *rq_src; + int ret = 0; if (unlikely(cpu_is_offline(dest_cpu))) - return; + return ret; rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); @@ -4493,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); } - + ret = 1; out: double_rq_unlock(rq_src, rq_dest); + return ret; } /* @@ -4565,9 +4914,12 @@ wait_to_die: /* Figure out where task on dead CPU should go, use force if neccessary. 
*/ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) { + runqueue_t *rq; + unsigned long flags; int dest_cpu; cpumask_t mask; +restart: /* On same node? */ mask = node_to_cpumask(cpu_to_node(dead_cpu)); cpus_and(mask, mask, tsk->cpus_allowed); @@ -4579,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) /* No more Mr. Nice Guy. */ if (dest_cpu == NR_CPUS) { + rq = task_rq_lock(tsk, &flags); cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); + task_rq_unlock(rq, &flags); /* * Don't tell them about moving exiting tasks or @@ -4592,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) "longer affine to cpu%d\n", tsk->pid, tsk->comm, dead_cpu); } - __migrate_task(tsk, dead_cpu, dest_cpu); + if (!__migrate_task(tsk, dead_cpu, dest_cpu)) + goto restart; } /* @@ -4719,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu) * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. */ -static int migration_call(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int __cpuinit migration_call(struct notifier_block *nfb, + unsigned long action, + void *hcpu) { int cpu = (long)hcpu; struct task_struct *p; @@ -4746,6 +5102,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!cpu_rq(cpu)->migration_thread) + break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, any_online_cpu(cpu_online_map)); @@ -4788,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, /* Register at highest priority so that task migration (migrate_all_tasks) * happens before everything else. 
*/ -static struct notifier_block migration_notifier = { +static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, .priority = 10 }; @@ -5589,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node) } #endif +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we * can switch it on easily if needed. @@ -5604,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu) #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct sched_domain, core_domains); -static struct sched_group sched_group_core[NR_CPUS]; +static struct sched_group *sched_group_core_bycpu[NR_CPUS]; #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) @@ -5620,7 +5979,7 @@ static int cpu_to_core_group(int cpu) #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static struct sched_group sched_group_phys[NR_CPUS]; +static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; static int cpu_to_phys_group(int cpu) { #if defined(CONFIG_SCHED_MC) @@ -5677,13 +6036,74 @@ next_sg: } #endif +/* Free memory allocated for various sched_group structures */ +static void free_sched_groups(const cpumask_t *cpu_map) +{ + int cpu; +#ifdef CONFIG_NUMA + int i; + + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + 
kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif + for_each_cpu_mask(cpu, *cpu_map) { + if (sched_group_phys_bycpu[cpu]) { + kfree(sched_group_phys_bycpu[cpu]); + sched_group_phys_bycpu[cpu] = NULL; + } +#ifdef CONFIG_SCHED_MC + if (sched_group_core_bycpu[cpu]) { + kfree(sched_group_core_bycpu[cpu]); + sched_group_core_bycpu[cpu] = NULL; + } +#endif + } +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -void build_sched_domains(const cpumask_t *cpu_map) +static int build_sched_domains(const cpumask_t *cpu_map) { int i; + struct sched_group *sched_group_phys = NULL; +#ifdef CONFIG_SCHED_MC + struct sched_group *sched_group_core = NULL; +#endif #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; struct sched_group *sched_group_allnodes = NULL; @@ -5691,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map) /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, - GFP_ATOMIC); + sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); - return; + return -ENOMEM; } sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif @@ -5721,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map) if (!sched_group_allnodes) { printk(KERN_WARNING "Can not alloc allnodes sched group\n"); - break; + goto error; } sched_group_allnodes_bycpu[i] = sched_group_allnodes; @@ -5742,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map) cpus_and(sd->span, sd->span, *cpu_map); #endif + if (!sched_group_phys) { + sched_group_phys + = kmalloc(sizeof(struct sched_group) * NR_CPUS, + GFP_KERNEL); + if (!sched_group_phys) { + printk (KERN_WARNING "Can not alloc phys sched" + "group\n"); + goto error; + } + sched_group_phys_bycpu[i] = sched_group_phys; + } + p = sd; sd = 
&per_cpu(phys_domains, i); group = cpu_to_phys_group(i); @@ -5751,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map) sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_MC + if (!sched_group_core) { + sched_group_core + = kmalloc(sizeof(struct sched_group) * NR_CPUS, + GFP_KERNEL); + if (!sched_group_core) { + printk (KERN_WARNING "Can not alloc core sched" + "group\n"); + goto error; + } + sched_group_core_bycpu[i] = sched_group_core; + } + p = sd; sd = &per_cpu(core_domains, i); group = cpu_to_core_group(i); @@ -5834,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map) domainspan = sched_domain_node_span(i); cpus_and(domainspan, domainspan, *cpu_map); - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for " + "node %d\n", i); + goto error; + } sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; - if (sd->groups == NULL) { - /* Turn off balancing if we have no groups */ - sd->flags = 0; - } - } - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", i); - continue; } sg->cpu_power = 0; sg->cpumask = nodemask; + sg->next = sg; cpus_or(covered, covered, nodemask); prev = sg; @@ -5870,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map) if (cpus_empty(tmp)) continue; - sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sg = kmalloc_node(sizeof(struct sched_group), + GFP_KERNEL, i); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for node %d\n", j); - break; + goto error; } sg->cpu_power = 0; sg->cpumask = tmp; + sg->next = prev->next; cpus_or(covered, covered, tmp); prev->next = sg; prev = sg; } - prev->next = sched_group_nodes[i]; } #endif /* Calculate CPU power for physical packages and nodes */ +#ifdef CONFIG_SCHED_SMT for_each_cpu_mask(i, *cpu_map) { - int power; 
struct sched_domain *sd; -#ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); - power = SCHED_LOAD_SCALE; - sd->groups->cpu_power = power; + sd->groups->cpu_power = SCHED_LOAD_SCALE; + } #endif #ifdef CONFIG_SCHED_MC + for_each_cpu_mask(i, *cpu_map) { + int power; + struct sched_domain *sd; sd = &per_cpu(core_domains, i); - power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) * SCHED_LOAD_SCALE / 10; sd->groups->cpu_power = power; + } +#endif + for_each_cpu_mask(i, *cpu_map) { + struct sched_domain *sd; +#ifdef CONFIG_SCHED_MC sd = &per_cpu(phys_domains, i); + if (i != first_cpu(sd->groups->cpumask)) + continue; - /* - * This has to be < 2 * SCHED_LOAD_SCALE - * Lets keep it SCHED_LOAD_SCALE, so that - * while calculating NUMA group's cpu_power - * we can simply do - * numa_group->cpu_power += phys_group->cpu_power; - * - * See "only add power once for each physical pkg" - * comment below - */ - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = 0; + if (sched_mc_power_savings || sched_smt_power_savings) { + int j; + + for_each_cpu_mask(j, sd->groups->cpumask) { + struct sched_domain *sd1; + sd1 = &per_cpu(core_domains, j); + /* + * for each core we will add once + * to the group in physical domain + */ + if (j != first_cpu(sd1->groups->cpumask)) + continue; + + if (sched_smt_power_savings) + sd->groups->cpu_power += sd1->groups->cpu_power; + else + sd->groups->cpu_power += SCHED_LOAD_SCALE; + } + } else + /* + * This has to be < 2 * SCHED_LOAD_SCALE + * Lets keep it SCHED_LOAD_SCALE, so that + * while calculating NUMA group's cpu_power + * we can simply do + * numa_group->cpu_power += phys_group->cpu_power; + * + * See "only add power once for each physical pkg" + * comment below + */ + sd->groups->cpu_power = SCHED_LOAD_SCALE; #else + int power; sd = 
&per_cpu(phys_domains, i); - power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * - (cpus_weight(sd->groups->cpumask)-1) / 10; + if (sched_smt_power_savings) + power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); + else + power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif } @@ -5945,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map) * Tune cache-hot values: */ calibrate_migration_costs(cpu_map); + + return 0; + +error: + free_sched_groups(cpu_map); + return -ENOMEM; } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(const cpumask_t *cpu_map) +static int arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; + int err; /* * Setup mask for cpus without special case scheduling requirements. @@ -5960,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) */ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); - build_sched_domains(&cpu_default_map); + err = build_sched_domains(&cpu_default_map); + + return err; } static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { -#ifdef CONFIG_NUMA - int i; - int cpu; - - for_each_cpu_mask(cpu, *cpu_map) { - struct sched_group *sched_group_allnodes - = sched_group_allnodes_bycpu[cpu]; - struct sched_group **sched_group_nodes - = sched_group_nodes_bycpu[cpu]; - - if (sched_group_allnodes) { - kfree(sched_group_allnodes); - sched_group_allnodes_bycpu[cpu] = NULL; - } - - if (!sched_group_nodes) - continue; - - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); - struct sched_group *oldsg, *sg = sched_group_nodes[i]; - - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) - continue; - - if (sg == NULL) - continue; - sg = sg->next; -next_sg: - oldsg = sg; - sg = sg->next; - kfree(oldsg); - if (oldsg != sched_group_nodes[i]) - goto next_sg; - } - kfree(sched_group_nodes); - sched_group_nodes_bycpu[cpu] = NULL; - } -#endif + 
free_sched_groups(cpu_map); } /* @@ -6029,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * correct sched domains * Call with hotplug lock held */ -void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) { cpumask_t change_map; + int err = 0; cpus_and(*partition1, *partition1, cpu_online_map); cpus_and(*partition2, *partition2, cpu_online_map); @@ -6040,11 +6488,87 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) /* Detach sched domains from all of the affected cpus */ detach_destroy_domains(&change_map); if (!cpus_empty(*partition1)) - build_sched_domains(partition1); - if (!cpus_empty(*partition2)) - build_sched_domains(partition2); + err = build_sched_domains(partition1); + if (!err && !cpus_empty(*partition2)) + err = build_sched_domains(partition2); + + return err; +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +int arch_reinit_sched_domains(void) +{ + int err; + + lock_cpu_hotplug(); + detach_destroy_domains(&cpu_online_map); + err = arch_init_sched_domains(&cpu_online_map); + unlock_cpu_hotplug(); + + return err; +} + +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ + int ret; + + if (buf[0] != '0' && buf[0] != '1') + return -EINVAL; + + if (smt) + sched_smt_power_savings = (buf[0] == '1'); + else + sched_mc_power_savings = (buf[0] == '1'); + + ret = arch_reinit_sched_domains(); + + return ret ? 
ret : count; } +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} +#endif + +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 0); +} +SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); +#endif + +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) +{ + return sprintf(page, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 1); +} +SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + + #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. 
The domains @@ -6126,7 +6650,6 @@ void __init sched_init(void) rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); - rq->cpu = i; #endif atomic_set(&rq->nr_iowait, 0); @@ -6141,6 +6664,7 @@ void __init sched_init(void) } } + set_load_weight(&init_task); /* * The boot idle thread does lazy MMU switching as well: */ @@ -6187,11 +6711,12 @@ void normalize_rt_tasks(void) runqueue_t *rq; read_lock_irq(&tasklist_lock); - for_each_process (p) { + for_each_process(p) { if (!rt_task(p)) continue; - rq = task_rq_lock(p, &flags); + spin_lock_irqsave(&p->pi_lock, flags); + rq = __task_rq_lock(p); array = p->array; if (array) @@ -6202,7 +6727,8 @@ void normalize_rt_tasks(void) resched_task(rq->curr); } - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); + spin_unlock_irqrestore(&p->pi_lock, flags); } read_unlock_irq(&tasklist_lock); } diff --git a/kernel/signal.c b/kernel/signal.c index e5f8aea78ff..52adf53929f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -23,12 +23,12 @@ #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> -#include <linux/audit.h> #include <linux/capability.h> #include <asm/param.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> +#include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. @@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) spin_unlock_irqrestore(&sighand->siglock, flags); } +static inline int may_ptrace_stop(void) +{ + if (!likely(current->ptrace & PT_PTRACED)) + return 0; + + if (unlikely(current->parent == current->real_parent && + (current->ptrace & PT_ATTACHED))) + return 0; + + if (unlikely(current->signal == current->parent->signal) && + unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) + return 0; + + /* + * Are we in the middle of do_coredump? 
+ * If so and our tracer is also part of the coredump stopping + * is a deadlock situation, and pointless because our tracer + * is dead so don't allow us to stop. + * If SIGKILL was already sent before the caller unlocked + * ->siglock we must see ->core_waiters != 0. Otherwise it + * is safe to enter schedule(). + */ + if (unlikely(current->mm->core_waiters) && + unlikely(current->mm == current->parent->mm)) + return 0; + + return 1; +} + /* * This must be called with current->sighand->siglock held. * @@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); try_to_freeze(); read_lock(&tasklist_lock); - if (likely(current->ptrace & PT_PTRACED) && - likely(current->parent != current->real_parent || - !(current->ptrace & PT_ATTACHED)) && - (likely(current->parent->signal != current->signal) || - !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { + if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2..8f03e3b89b5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int cpu_callback(struct notifier_block *nfb, +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(ksoftirqd, hotcpu)) + break; /* Unbind so it can run. Fall thru. 
*/ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); @@ -484,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __devinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf0290..6b76caa2298 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int +static int __devinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(watchdog_task, hotcpu)) + break; /* Unbind so it can run. Fall thru. 
*/ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); @@ -140,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __devinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d73146..2c0aacc37c5 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/syscalls.h> +#include <linux/kthread.h> #include <asm/atomic.h> #include <asm/semaphore.h> #include <asm/uaccess.h> @@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; static DECLARE_MUTEX(stopmachine_mutex); -static int stopmachine(void *cpu) +static int stopmachine(void *unused) { int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); - /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ atomic_inc(&stopmachine_thread_ack); @@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { - int i, ret = 0; + int ret = 0; + unsigned int i; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. 
*/ @@ -96,11 +96,16 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { + struct task_struct *tsk; if (i == raw_smp_processor_id()) continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) + tsk = kthread_create(stopmachine, NULL, "stopmachine"); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); break; + } + kthread_bind(tsk, i); + wake_up_process(tsk); stopmachine_num_threads++; } diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936..2d5179c67ce 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -13,7 +13,6 @@ #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> -#include <linux/init.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -57,6 +56,12 @@ #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a,b) (-EINVAL) #endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -132,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v) { int ret = NOTIFY_DONE; - struct notifier_block *nb; + struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); while (nb) { + next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; - nb = rcu_dereference(nb->next); + nb = next_nb; } return ret; } @@ -583,7 +589,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -void kernel_restart_prepare(char *cmd) +static void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; @@ -617,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. 
*/ -void kernel_kexec(void) +static void kernel_kexec(void) { #ifdef CONFIG_KEXEC struct kimage *image; @@ -631,7 +637,6 @@ void kernel_kexec(void) machine_kexec(image); #endif } -EXPORT_SYMBOL_GPL(kernel_kexec); void kernel_shutdown_prepare(enum system_states state) { @@ -1860,23 +1865,20 @@ out: * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * - * tasklist_lock locking optimisation: - * If we are current and single threaded, we do not need to take the tasklist - * lock or the siglock. No one else can take our signal_struct away, - * no one else can reap the children to update signal->c* counters, and - * no one else can race with the signal-> fields. - * If we do not take the tasklist_lock, the signal-> fields could be read - * out of order while another thread was just exiting. So we place a - * read memory barrier when we avoid the lock. On the writer side, - * write memory barrier is implied in __exit_signal as __exit_signal releases - * the siglock spinlock after updating the signal-> fields. - * - * We don't really need the siglock when we access the non c* fields - * of the signal_struct (for RUSAGE_SELF) even in multithreaded - * case, since we take the tasklist lock for read and the non c* signal-> - * fields are updated only in __exit_signal, which is called with - * tasklist_lock taken for write, hence these two threads cannot execute - * concurrently. + * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. 
If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. * */ @@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; - int need_lock = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; - if (p != current || !thread_group_empty(p)) - need_lock = 1; - - if (need_lock) { - read_lock(&tasklist_lock); - if (unlikely(!p->signal)) { - read_unlock(&tasklist_lock); - return; - } - } else - /* See locking comments above */ - smp_rmb(); + rcu_read_lock(); + if (!lock_task_sighand(p, &flags)) { + rcu_read_unlock(); + return; + } switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; stime = p->signal->cstime; r->ru_nvcsw = p->signal->cnvcsw; r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; - spin_unlock_irqrestore(&p->sighand->siglock, flags); if (who == RUSAGE_CHILDREN) break; @@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } - if (need_lock) - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + rcu_read_unlock(); + cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, return -EFAULT; return 0; } + case PR_GET_ENDIAN: + error = GET_ENDIAN(current, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(current, arg2); + break; + default: error = -EINVAL; break; diff --git 
a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f..6991bece67e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); @@ -132,3 +133,4 @@ cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e82726faeef..93a2c539864 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern int sysctl_panic_on_oom; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; @@ -72,6 +73,7 @@ extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; +extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -131,6 +133,10 @@ extern int acct_parm[]; extern int no_unaligned_warning; #endif +#ifdef CONFIG_RT_MUTEXES +extern int max_lock_depth; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -142,7 +148,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; static ctl_table dev_table[]; @@ -150,7 +155,7 @@ extern ctl_table random_table[]; #ifdef CONFIG_UNIX98_PTYS extern ctl_table pty_table[]; #endif -#ifdef CONFIG_INOTIFY +#ifdef 
CONFIG_INOTIFY_USER extern ctl_table inotify_table[]; #endif @@ -202,12 +207,6 @@ static ctl_table root_table[] = { }, #endif { - .ctl_name = CTL_PROC, - .procname = "proc", - .mode = 0555, - .child = proc_table, - }, - { .ctl_name = CTL_FS, .procname = "fs", .mode = 0555, @@ -398,7 +397,7 @@ static ctl_table kern_table[] = { .strategy = &sysctl_string, }, #endif -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", @@ -683,6 +682,27 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_COMPAT + { + .ctl_name = KERN_COMPAT_LOG, + .procname = "compat-log", + .data = &compat_log, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif +#ifdef CONFIG_RT_MUTEXES + { + .ctl_name = KERN_MAX_LOCK_DEPTH, + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { .ctl_name = 0 } }; @@ -702,6 +722,14 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = VM_PANIC_ON_OOM, + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, @@ -915,10 +943,18 @@ static ctl_table vm_table[] = { .strategy = &sysctl_jiffies, }, #endif - { .ctl_name = 0 } -}; - -static ctl_table proc_table[] = { +#ifdef CONFIG_X86_32 + { + .ctl_name = VM_VDSO_ENABLED, + .procname = "vdso_enabled", + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, +#endif { .ctl_name = 0 } }; @@ -1028,7 +1064,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER 
{ .ctl_name = FS_INOTIFY, .procname = "inotify", diff --git a/kernel/time.c b/kernel/time.c index b00ddc71ced..5bd48974764 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); #else +#ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval * and therefore only yields usec accuracy @@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif +#endif /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 00000000000..e1dfd8e86cc --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1 @@ +obj-y += clocksource.o jiffies.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 00000000000..74eca5939bd --- /dev/null +++ b/kernel/time/clocksource.c @@ -0,0 +1,349 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + * o get rid of clocksource_jiffies extern + */ + +#include <linux/clocksource.h> +#include <linux/sysdev.h> +#include <linux/init.h> +#include <linux/module.h> + +/* XXX - Would like a better way for initializing curr_clocksource */ +extern struct clocksource clocksource_jiffies; + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. Initialized to clocksource_jiffies. + * next_clocksource: + * pending next selected clocksource. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_lock: + * protects manipulations to curr_clocksource and next_clocksource + * and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *next_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_SPINLOCK(clocksource_lock); +static char override_name[32]; +static int finished_booting; + +/* clocksource_done_booting - Called near the end of bootup + * + * Hack to avoid lots of clocksource churn at boot time + */ +static int __init clocksource_done_booting(void) +{ + finished_booting = 1; + return 0; +} + +late_initcall(clocksource_done_booting); + +/** + * clocksource_get_next - Returns the selected clocksource + * + */ +struct clocksource *clocksource_get_next(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + if (next_clocksource && finished_booting) { + curr_clocksource = next_clocksource; + next_clocksource = NULL; + } + spin_unlock_irqrestore(&clocksource_lock, flags); + + return curr_clocksource; +} + +/** + * select_clocksource - Finds the best registered clocksource. + * + * Private function. Must hold clocksource_lock when called. + * + * Looks through the list of registered clocksources, returning + * the one with the highest rating value. 
If there is a clocksource + * name that matches the override string, it returns that clocksource. + */ +static struct clocksource *select_clocksource(void) +{ + struct clocksource *best = NULL; + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (!best) + best = src; + + /* check for override: */ + if (strlen(src->name) == strlen(override_name) && + !strcmp(src->name, override_name)) { + best = src; + break; + } + /* pick the highest rating: */ + if (src->rating > best->rating) + best = src; + } + + return best; +} + +/** + * is_registered_source - Checks if clocksource is registered + * @c: pointer to a clocksource + * + * Private helper function. Must hold clocksource_lock when called. + * + * Returns one if the clocksource is already registered, zero otherwise. + */ +static int is_registered_source(struct clocksource *c) +{ + int len = strlen(c->name); + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (strlen(src->name) == len && !strcmp(src->name, c->name)) + return 1; + } + + return 0; +} + +/** + * clocksource_register - Used to install new clocksources + * @t: clocksource to be registered + * + * Returns -EBUSY if registration fails, zero otherwise. + */ +int clocksource_register(struct clocksource *c) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + /* check if clocksource is already registered */ + if (is_registered_source(c)) { + printk("register_clocksource: Cannot register %s. 
" + "Already registered!", c->name); + ret = -EBUSY; + } else { + /* register it */ + list_add(&c->list, &clocksource_list); + /* scan the registered clocksources, and pick the best one */ + next_clocksource = select_clocksource(); + } + spin_unlock_irqrestore(&clocksource_lock, flags); + return ret; +} +EXPORT_SYMBOL(clocksource_register); + +/** + * clocksource_reselect - Rescan list for next clocksource + * + * A quick helper function to be used if a clocksource changes its + * rating. Forces the clocksource list to be re-scanned for the best + * clocksource. + */ +void clocksource_reselect(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} +EXPORT_SYMBOL(clocksource_reselect); + +/** + * sysfs_show_current_clocksources - sysfs interface for current clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t +sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +{ + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + curr += sprintf(curr, "%s ", curr_clocksource->name); + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selction. + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + const char *buf, size_t count) +{ + size_t ret = count; + /* strings from sysfs write are not 0 terminated! 
*/ + if (count >= sizeof(override_name)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + spin_lock_irq(&clocksource_lock); + + /* copy the name given: */ + memcpy(override_name, buf, count); + override_name[count] = 0; + + /* try to select it: */ + next_clocksource = select_clocksource(); + + spin_unlock_irq(&clocksource_lock); + + return ret; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + curr += sprintf(curr, "%s ", src->name); + } + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static SYSDEV_ATTR(available_clocksource, 0600, + sysfs_show_available_clocksources, NULL); + +static struct sysdev_class clocksource_sysclass = { + set_kset_name("clocksource"), +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + + if (!error) + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_current_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); + +/** + * boot_override_clocksource - 
boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + unsigned long flags; + spin_lock_irqsave(&clocksource_lock, flags); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + spin_unlock_irqrestore(&clocksource_lock, flags); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. " + "Use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 00000000000..126bb30c4af --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,73 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/init.h> + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accurately tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the numerator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater than 8 overflow 32bits when + * HZ=100. 
+ */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(void) +{ + return (cycle_t) jiffies; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 0, /* lowest rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .is_continuous = 0, /* tick based, not free running */ +}; + +static int __init init_jiffies_clocksource(void) +{ + return clocksource_register(&clocksource_jiffies); +} + +module_init(init_jiffies_clocksource); diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468..5a896025306 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); } EXPORT_SYMBOL(init_timer); @@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(tvec_base_t *base, tvec_t *tv, int index) { /* cascade all the timers from tv up one level */ - struct list_head *head, *curr; + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); - head = tv->vec + index; - curr = head->next; /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. 
*/ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); - curr = curr->next; - internal_add_timer(base, tmp); + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(timer->base != base); + internal_add_timer(base, timer); } - INIT_LIST_HEAD(head); return index; } @@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base) spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list = LIST_HEAD_INIT(work_list); + struct list_head work_list; struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + /* * Cascade timers: */ @@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_splice_init(base->tv1.vec + index, &work_list); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; @@ -601,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static long time_phase; /* phase offset (scaled us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ static long time_adj; /* tick adjust (scaled 1 / HZ) */ @@ -751,27 +746,14 @@ static long adjtime_adjustment(void) } /* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +static void update_ntp_one_tick(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; time_adjust_step = adjtime_adjustment(); if (time_adjust_step) /* Reduce by this step the amount of time 
left */ time_adjust -= time_adjust_step; - delta_nsec = tick_nsec + time_adjust_step * 1000; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { - long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); - time_phase -= ltemp << (SHIFT_SCALE - 10); - delta_nsec += ltemp; - } - xtime.tv_nsec += delta_nsec; - time_interpolator_update(delta_nsec); /* Changes by adjtime() do not take effect till next tick. */ if (time_next_adjust != 0) { @@ -784,36 +766,378 @@ static void update_wall_time_one_tick(void) * Return how long ticks are at the moment, that is, how much time * update_wall_time_one_tick will add to xtime next time we call it * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 - * bits to the right of the binary point. + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. * This function has no side-effects. */ u64 current_tick_length(void) { long delta_nsec; + u64 ret; + /* calculate the finest interval NTP will allow. 
+ * ie: nanosecond value shifted by (SHIFT_SCALE - 10) + */ delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; + ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); + + return ret; } -/* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks +/* XXX - all of this timekeeping code should be later moved to time.c */ +#include <linux/clocksource.h> +static struct clocksource *clock; /* pointer to current clocksource */ + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). 
+ */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + ntp_clear(); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new 
clocksource + */ +static int change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + new = clocksource_get_next(); + if (clock != new) { + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); + return 1; + } else if (clock->update_callback) { + return clock->update_callback(); + } + return 0; +} +#else +#define change_clocksource() (0) +#endif + +/** + * timeofday_is_continuous - check to see if timekeeping is free running */ -static void update_wall_time(unsigned long ticks) +int timekeeping_is_continuous(void) { + unsigned long seq; + int ret; + do { - ticks--; - update_wall_time_one_tick(); - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; + seq = read_seqbegin(&xtime_lock); + + ret = clock->is_continuous; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, tick_nsec); + clock->cycle_last = clocksource_read(clock); + ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * still managed by arch specific suspend/resume code. 
+ */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + /* restart the last cycle value */ + clock->cycle_last = clocksource_read(clock); + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * If the error is already larger, we look ahead another tick, + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) +{ + int adj; + + /* + * As soon as the machine is synchronized to the external time + * source this should be the common case. + */ + error >>= 2; + if (likely(sign > 0 ? error <= *interval : error >= *interval)) + return sign; + + /* + * An extra look ahead dampens the effect of the current error, + * which can grow quite large with continously late updates, as + * it would dominate the adjustment value and can lead to + * oscillation. + */ + error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + error -= clock->xtime_interval >> 1; + + adj = 0; + while (1) { + error >>= 1; + if (sign > 0 ? error <= *interval : error >= *interval) + break; + adj++; + } + + /* + * Add the current adjustments to the error and take the offset + * into account, the latter can cause the error to be hardly + * reduced at the next tick. 
Check the error again if there's + * room for another adjustment, thus further reducing the error + * which otherwise had to be corrected at the next update. + */ + error = (error << 1) - *interval + *offset; + if (sign > 0 ? error > *interval : error < *interval) + adj++; + + *interval <<= adj; + *offset <<= adj; + return sign << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. + */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + adj = clocksource_bigadjust(1, error, &interval, &offset); + } else if (error < -interval) { + interval = -interval; + offset = -offset; + adj = clocksource_bigadjust(-1, error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); +} + +/* + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +static void update_wall_time(void) +{ + cycle_t offset; + + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. 
+ */ + while (offset >= clock->cycle_interval) { + /* accumulate one interval */ + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } - } while (ticks); + + /* interpolator bits */ + time_interpolator_update(clock->xtime_interval + >> clock->shift); + /* increment the NTP state machine */ + update_ntp_one_tick(); + + /* accumulate error between NTP and clock interval */ + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } + + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); + + /* store full nanoseconds into xtime */ + xtime.tv_nsec = clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + if (change_clocksource()) { + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, tick_nsec); + } } /* @@ -919,10 +1243,8 @@ static inline void update_times(void) unsigned long ticks; ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); - } + wall_jiffies += ticks; + update_wall_time(); calc_load(ticks); } @@ -1330,7 +1652,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int timer_cpu_notify(struct notifier_block *self, +static int __devinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1350,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block timers_nb = { +static struct notifier_block __devinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/unwind.c b/kernel/unwind.c 
new file mode 100644 index 00000000000..f69c804c8e6 --- /dev/null +++ b/kernel/unwind.c @@ -0,0 +1,918 @@ +/* + * Copyright (C) 2002-2006 Novell, Inc. + * Jan Beulich <jbeulich@novell.com> + * This code is released under version 2 of the GNU GPL. + * + * A simple API for unwinding kernel stacks. This is used for + * debugging and error reporting purposes. The kernel doesn't need + * full-blown stack unwinding with all the bells and whistles, so there + * is not much point in implementing the full Dwarf2 unwind API. + */ + +#include <linux/unwind.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/stop_machine.h> +#include <asm/sections.h> +#include <asm/uaccess.h> +#include <asm/unaligned.h> + +extern char __start_unwind[], __end_unwind[]; + +#define MAX_STACK_DEPTH 8 + +#define EXTRA_INFO(f) { \ + BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ + % FIELD_SIZEOF(struct unwind_frame_info, f)) \ + + offsetof(struct unwind_frame_info, f) \ + / FIELD_SIZEOF(struct unwind_frame_info, f), \ + FIELD_SIZEOF(struct unwind_frame_info, f) \ + } +#define PTREGS_INFO(f) EXTRA_INFO(regs.f) + +static const struct { + unsigned offs:BITS_PER_LONG / 2; + unsigned width:BITS_PER_LONG / 2; +} reg_info[] = { + UNW_REGISTER_INFO +}; + +#undef PTREGS_INFO +#undef EXTRA_INFO + +#ifndef REG_INVALID +#define REG_INVALID(r) (reg_info[r].width == 0) +#endif + +#define DW_CFA_nop 0x00 +#define DW_CFA_set_loc 0x01 +#define DW_CFA_advance_loc1 0x02 +#define DW_CFA_advance_loc2 0x03 +#define DW_CFA_advance_loc4 0x04 +#define DW_CFA_offset_extended 0x05 +#define DW_CFA_restore_extended 0x06 +#define DW_CFA_undefined 0x07 +#define DW_CFA_same_value 0x08 +#define DW_CFA_register 0x09 +#define DW_CFA_remember_state 0x0a +#define DW_CFA_restore_state 0x0b +#define DW_CFA_def_cfa 0x0c +#define DW_CFA_def_cfa_register 0x0d +#define DW_CFA_def_cfa_offset 0x0e +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_expression 0x10 +#define DW_CFA_offset_extended_sf 0x11 
+#define DW_CFA_def_cfa_sf 0x12 +#define DW_CFA_def_cfa_offset_sf 0x13 +#define DW_CFA_val_offset 0x14 +#define DW_CFA_val_offset_sf 0x15 +#define DW_CFA_val_expression 0x16 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_GNU_window_save 0x2d +#define DW_CFA_GNU_args_size 0x2e +#define DW_CFA_GNU_negative_offset_extended 0x2f +#define DW_CFA_hi_user 0x3f + +#define DW_EH_PE_FORM 0x07 +#define DW_EH_PE_native 0x00 +#define DW_EH_PE_leb128 0x01 +#define DW_EH_PE_data2 0x02 +#define DW_EH_PE_data4 0x03 +#define DW_EH_PE_data8 0x04 +#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_ADJUST 0x70 +#define DW_EH_PE_abs 0x00 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_indirect 0x80 +#define DW_EH_PE_omit 0xff + +typedef unsigned long uleb128_t; +typedef signed long sleb128_t; + +static struct unwind_table { + struct { + unsigned long pc; + unsigned long range; + } core, init; + const void *address; + unsigned long size; + struct unwind_table *link; + const char *name; +} root_table, *last_table; + +struct unwind_item { + enum item_location { + Nowhere, + Memory, + Register, + Value + } where; + uleb128_t value; +}; + +struct unwind_state { + uleb128_t loc, org; + const u8 *cieStart, *cieEnd; + uleb128_t codeAlign; + sleb128_t dataAlign; + struct cfa { + uleb128_t reg, offs; + } cfa; + struct unwind_item regs[ARRAY_SIZE(reg_info)]; + unsigned stackDepth:8; + unsigned version:8; + const u8 *label; + const u8 *stack[MAX_STACK_DEPTH]; +}; + +static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; + +static struct unwind_table *find_table(unsigned long pc) +{ + struct unwind_table *table; + + for (table = &root_table; table; table = table->link) + if ((pc >= table->core.pc + && pc < table->core.pc + table->core.range) + || (pc >= table->init.pc + && pc < table->init.pc + table->init.range)) + break; + + return table; +} + +static void 
init_unwind_table(struct unwind_table *table, + const char *name, + const void *core_start, + unsigned long core_size, + const void *init_start, + unsigned long init_size, + const void *table_start, + unsigned long table_size) +{ + table->core.pc = (unsigned long)core_start; + table->core.range = core_size; + table->init.pc = (unsigned long)init_start; + table->init.range = init_size; + table->address = table_start; + table->size = table_size; + table->link = NULL; + table->name = name; +} + +void __init unwind_init(void) +{ + init_unwind_table(&root_table, "kernel", + _text, _end - _text, + NULL, 0, + __start_unwind, __end_unwind - __start_unwind); +} + +#ifdef CONFIG_MODULES + +/* Must be called with module_mutex held. */ +void *unwind_add_table(struct module *module, + const void *table_start, + unsigned long table_size) +{ + struct unwind_table *table; + + if (table_size <= 0) + return NULL; + + table = kmalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return NULL; + + init_unwind_table(table, module->name, + module->module_core, module->core_size, + module->module_init, module->init_size, + table_start, table_size); + + if (last_table) + last_table->link = table; + else + root_table.link = table; + last_table = table; + + return table; +} + +struct unlink_table_info +{ + struct unwind_table *table; + int init_only; +}; + +static int unlink_table(void *arg) +{ + struct unlink_table_info *info = arg; + struct unwind_table *table = info->table, *prev; + + for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) + ; + + if (prev->link) { + if (info->init_only) { + table->init.pc = 0; + table->init.range = 0; + info->table = NULL; + } else { + prev->link = table->link; + if (!prev->link) + last_table = prev; + } + } else + info->table = NULL; + + return 0; +} + +/* Must be called with module_mutex held. 
*/ +void unwind_remove_table(void *handle, int init_only) +{ + struct unwind_table *table = handle; + struct unlink_table_info info; + + if (!table || table == &root_table) + return; + + if (init_only && table == last_table) { + table->init.pc = 0; + table->init.range = 0; + return; + } + + info.table = table; + info.init_only = init_only; + stop_machine_run(unlink_table, &info, NR_CPUS); + + if (info.table) + kfree(table); +} + +#endif /* CONFIG_MODULES */ + +static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) +{ + const u8 *cur = *pcur; + uleb128_t value; + unsigned shift; + + for (shift = 0, value = 0; cur < end; shift += 7) { + if (shift + 7 > 8 * sizeof(value) + && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { + cur = end + 1; + break; + } + value |= (uleb128_t)(*cur & 0x7f) << shift; + if (!(*cur++ & 0x80)) + break; + } + *pcur = cur; + + return value; +} + +static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) +{ + const u8 *cur = *pcur; + sleb128_t value; + unsigned shift; + + for (shift = 0, value = 0; cur < end; shift += 7) { + if (shift + 7 > 8 * sizeof(value) + && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { + cur = end + 1; + break; + } + value |= (sleb128_t)(*cur & 0x7f) << shift; + if (!(*cur & 0x80)) { + value |= -(*cur++ & 0x40) << shift; + break; + } + } + *pcur = cur; + + return value; +} + +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType) +{ + unsigned long value = 0; + union { + const u8 *p8; + const u16 *p16u; + const s16 *p16s; + const u32 *p32u; + const s32 *p32s; + const unsigned long *pul; + } ptr; + + if (ptrType < 0 || ptrType == DW_EH_PE_omit) + return 0; + ptr.p8 = *pLoc; + switch(ptrType & DW_EH_PE_FORM) { + case DW_EH_PE_data2: + if (end < (const void *)(ptr.p16u + 1)) + return 0; + if(ptrType & DW_EH_PE_signed) + value = get_unaligned(ptr.p16s++); + else + value = get_unaligned(ptr.p16u++); + break; + case DW_EH_PE_data4: +#ifdef CONFIG_64BIT + if (end < 
(const void *)(ptr.p32u + 1)) + return 0; + if(ptrType & DW_EH_PE_signed) + value = get_unaligned(ptr.p32s++); + else + value = get_unaligned(ptr.p32u++); + break; + case DW_EH_PE_data8: + BUILD_BUG_ON(sizeof(u64) != sizeof(value)); +#else + BUILD_BUG_ON(sizeof(u32) != sizeof(value)); +#endif + case DW_EH_PE_native: + if (end < (const void *)(ptr.pul + 1)) + return 0; + value = get_unaligned(ptr.pul++); + break; + case DW_EH_PE_leb128: + BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); + value = ptrType & DW_EH_PE_signed + ? get_sleb128(&ptr.p8, end) + : get_uleb128(&ptr.p8, end); + if ((const void *)ptr.p8 > end) + return 0; + break; + default: + return 0; + } + switch(ptrType & DW_EH_PE_ADJUST) { + case DW_EH_PE_abs: + break; + case DW_EH_PE_pcrel: + value += (unsigned long)*pLoc; + break; + default: + return 0; + } + if ((ptrType & DW_EH_PE_indirect) + && __get_user(value, (unsigned long *)value)) + return 0; + *pLoc = ptr.p8; + + return value; +} + +static signed fde_pointer_type(const u32 *cie) +{ + const u8 *ptr = (const u8 *)(cie + 2); + unsigned version = *ptr; + + if (version != 1) + return -1; /* unsupported */ + if (*++ptr) { + const char *aug; + const u8 *end = (const u8 *)(cie + 1) + *cie; + uleb128_t len; + + /* check if augmentation size is first (and thus present) */ + if (*ptr != 'z') + return -1; + /* check if augmentation string is nul-terminated */ + if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) + return -1; + ++ptr; /* skip terminator */ + get_uleb128(&ptr, end); /* skip code alignment */ + get_sleb128(&ptr, end); /* skip data alignment */ + /* skip return address column */ + version <= 1 ? 
(void)++ptr : (void)get_uleb128(&ptr, end); + len = get_uleb128(&ptr, end); /* augmentation length */ + if (ptr + len < ptr || ptr + len > end) + return -1; + end = ptr + len; + while (*++aug) { + if (ptr >= end) + return -1; + switch(*aug) { + case 'L': + ++ptr; + break; + case 'P': { + signed ptrType = *ptr++; + + if (!read_pointer(&ptr, end, ptrType) || ptr > end) + return -1; + } + break; + case 'R': + return *ptr; + default: + return -1; + } + } + } + return DW_EH_PE_native|DW_EH_PE_abs; +} + +static int advance_loc(unsigned long delta, struct unwind_state *state) +{ + state->loc += delta * state->codeAlign; + + return delta > 0; +} + +static void set_rule(uleb128_t reg, + enum item_location where, + uleb128_t value, + struct unwind_state *state) +{ + if (reg < ARRAY_SIZE(state->regs)) { + state->regs[reg].where = where; + state->regs[reg].value = value; + } +} + +static int processCFI(const u8 *start, + const u8 *end, + unsigned long targetLoc, + signed ptrType, + struct unwind_state *state) +{ + union { + const u8 *p8; + const u16 *p16; + const u32 *p32; + } ptr; + int result = 1; + + if (start != state->cieStart) { + state->loc = state->org; + result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); + if (targetLoc == 0 && state->label == NULL) + return result; + } + for (ptr.p8 = start; result && ptr.p8 < end; ) { + switch(*ptr.p8 >> 6) { + uleb128_t value; + + case 0: + switch(*ptr.p8++) { + case DW_CFA_nop: + break; + case DW_CFA_set_loc: + if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) + result = 0; + break; + case DW_CFA_advance_loc1: + result = ptr.p8 < end && advance_loc(*ptr.p8++, state); + break; + case DW_CFA_advance_loc2: + result = ptr.p8 <= end + 2 + && advance_loc(*ptr.p16++, state); + break; + case DW_CFA_advance_loc4: + result = ptr.p8 <= end + 4 + && advance_loc(*ptr.p32++, state); + break; + case DW_CFA_offset_extended: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Memory, get_uleb128(&ptr.p8, end), 
state); + break; + case DW_CFA_val_offset: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Value, get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_offset_extended_sf: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); + break; + case DW_CFA_val_offset_sf: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Value, get_sleb128(&ptr.p8, end), state); + break; + case DW_CFA_restore_extended: + case DW_CFA_undefined: + case DW_CFA_same_value: + set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); + break; + case DW_CFA_register: + value = get_uleb128(&ptr.p8, end); + set_rule(value, + Register, + get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_remember_state: + if (ptr.p8 == state->label) { + state->label = NULL; + return 1; + } + if (state->stackDepth >= MAX_STACK_DEPTH) + return 0; + state->stack[state->stackDepth++] = ptr.p8; + break; + case DW_CFA_restore_state: + if (state->stackDepth) { + const uleb128_t loc = state->loc; + const u8 *label = state->label; + + state->label = state->stack[state->stackDepth - 1]; + memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); + memset(state->regs, 0, sizeof(state->regs)); + state->stackDepth = 0; + result = processCFI(start, end, 0, ptrType, state); + state->loc = loc; + state->label = label; + } else + return 0; + break; + case DW_CFA_def_cfa: + state->cfa.reg = get_uleb128(&ptr.p8, end); + /*nobreak*/ + case DW_CFA_def_cfa_offset: + state->cfa.offs = get_uleb128(&ptr.p8, end); + break; + case DW_CFA_def_cfa_sf: + state->cfa.reg = get_uleb128(&ptr.p8, end); + /*nobreak*/ + case DW_CFA_def_cfa_offset_sf: + state->cfa.offs = get_sleb128(&ptr.p8, end) + * state->dataAlign; + break; + case DW_CFA_def_cfa_register: + state->cfa.reg = get_uleb128(&ptr.p8, end); + break; + /*todo case DW_CFA_def_cfa_expression: */ + /*todo case DW_CFA_expression: */ + /*todo case DW_CFA_val_expression: */ + case DW_CFA_GNU_args_size: + get_uleb128(&ptr.p8, end); + break; + 
case DW_CFA_GNU_negative_offset_extended: + value = get_uleb128(&ptr.p8, end); + set_rule(value, + Memory, + (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_GNU_window_save: + default: + result = 0; + break; + } + break; + case 1: + result = advance_loc(*ptr.p8++ & 0x3f, state); + break; + case 2: + value = *ptr.p8++ & 0x3f; + set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); + break; + case 3: + set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); + break; + } + if (ptr.p8 > end) + result = 0; + if (result && targetLoc != 0 && targetLoc < state->loc) + return 1; + } + + return result + && ptr.p8 == end + && (targetLoc == 0 + || (/*todo While in theory this should apply, gcc in practice omits + everything past the function prolog, and hence the location + never reaches the end of the function. + targetLoc < state->loc &&*/ state->label == NULL)); +} + +/* Unwind to previous to frame. Returns 0 if successful, negative + * number in case of an error. */ +int unwind(struct unwind_frame_info *frame) +{ +#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) + const u32 *fde = NULL, *cie = NULL; + const u8 *ptr = NULL, *end = NULL; + unsigned long startLoc = 0, endLoc = 0, cfa; + unsigned i; + signed ptrType = -1; + uleb128_t retAddrReg = 0; + struct unwind_table *table; + struct unwind_state state; + + if (UNW_PC(frame) == 0) + return -EINVAL; + if ((table = find_table(UNW_PC(frame))) != NULL + && !(table->size & (sizeof(*fde) - 1))) { + unsigned long tableSize = table->size; + + for (fde = table->address; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + if (!*fde || (*fde & (sizeof(*fde) - 1))) + break; + if (!fde[1]) + continue; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) + - (unsigned long)table->address) + continue; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie 
<= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1] + || (ptrType = fde_pointer_type(cie)) < 0) { + cie = NULL; /* this is not a (valid) CIE */ + continue; + } + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType & DW_EH_PE_indirect + ? ptrType + : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); + if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) + break; + cie = NULL; + } + } + if (cie != NULL) { + memset(&state, 0, sizeof(state)); + state.cieEnd = ptr; /* keep here temporarily */ + ptr = (const u8 *)(cie + 2); + end = (const u8 *)(cie + 1) + *cie; + if ((state.version = *ptr) != 1) + cie = NULL; /* unsupported version */ + else if (*++ptr) { + /* check if augmentation size is first (and thus present) */ + if (*ptr == 'z') { + /* check for ignorable (or already handled) + * nul-terminated augmentation string */ + while (++ptr < end && *ptr) + if (strchr("LPR", *ptr) == NULL) + break; + } + if (ptr >= end || *ptr) + cie = NULL; + } + ++ptr; + } + if (cie != NULL) { + /* get code aligment factor */ + state.codeAlign = get_uleb128(&ptr, end); + /* get data aligment factor */ + state.dataAlign = get_sleb128(&ptr, end); + if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) + cie = NULL; + else { + retAddrReg = state.version <= 1 ? 
*ptr++ : get_uleb128(&ptr, end); + /* skip augmentation */ + if (((const char *)(cie + 2))[1] == 'z') + ptr += get_uleb128(&ptr, end); + if (ptr > end + || retAddrReg >= ARRAY_SIZE(reg_info) + || REG_INVALID(retAddrReg) + || reg_info[retAddrReg].width != sizeof(unsigned long)) + cie = NULL; + } + } + if (cie != NULL) { + state.cieStart = ptr; + ptr = state.cieEnd; + state.cieEnd = end; + end = (const u8 *)(fde + 1) + *fde; + /* skip augmentation */ + if (((const char *)(cie + 2))[1] == 'z') { + uleb128_t augSize = get_uleb128(&ptr, end); + + if ((ptr += augSize) > end) + fde = NULL; + } + } + if (cie == NULL || fde == NULL) { +#ifdef CONFIG_FRAME_POINTER + unsigned long top, bottom; +#endif + +#ifdef CONFIG_FRAME_POINTER + top = STACK_TOP(frame->task); + bottom = STACK_BOTTOM(frame->task); +# if FRAME_RETADDR_OFFSET < 0 + if (UNW_SP(frame) < top + && UNW_FP(frame) <= UNW_SP(frame) + && bottom < UNW_FP(frame) +# else + if (UNW_SP(frame) > top + && UNW_FP(frame) >= UNW_SP(frame) + && bottom > UNW_FP(frame) +# endif + && !((UNW_SP(frame) | UNW_FP(frame)) + & (sizeof(unsigned long) - 1))) { + unsigned long link; + + if (!__get_user(link, + (unsigned long *)(UNW_FP(frame) + + FRAME_LINK_OFFSET)) +# if FRAME_RETADDR_OFFSET < 0 + && link > bottom && link < UNW_FP(frame) +# else + && link > UNW_FP(frame) && link < bottom +# endif + && !(link & (sizeof(link) - 1)) + && !__get_user(UNW_PC(frame), + (unsigned long *)(UNW_FP(frame) + + FRAME_RETADDR_OFFSET))) { + UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET +# if FRAME_RETADDR_OFFSET < 0 + - +# else + + +# endif + sizeof(UNW_PC(frame)); + UNW_FP(frame) = link; + return 0; + } + } +#endif + return -ENXIO; + } + state.org = startLoc; + memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); + /* process instructions */ + if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) + || state.loc > endLoc + || state.regs[retAddrReg].where == Nowhere + || state.cfa.reg >= ARRAY_SIZE(reg_info) + || reg_info[state.cfa.reg].width != 
sizeof(unsigned long) + || state.cfa.offs % sizeof(unsigned long)) + return -EIO; + /* update frame */ + cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; + startLoc = min((unsigned long)UNW_SP(frame), cfa); + endLoc = max((unsigned long)UNW_SP(frame), cfa); + if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { + startLoc = min(STACK_LIMIT(cfa), cfa); + endLoc = max(STACK_LIMIT(cfa), cfa); + } +#ifndef CONFIG_64BIT +# define CASES CASE(8); CASE(16); CASE(32) +#else +# define CASES CASE(8); CASE(16); CASE(32); CASE(64) +#endif + for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { + if (REG_INVALID(i)) { + if (state.regs[i].where == Nowhere) + continue; + return -EIO; + } + switch(state.regs[i].where) { + default: + break; + case Register: + if (state.regs[i].value >= ARRAY_SIZE(reg_info) + || REG_INVALID(state.regs[i].value) + || reg_info[i].width > reg_info[state.regs[i].value].width) + return -EIO; + switch(reg_info[state.regs[i].value].width) { +#define CASE(n) \ + case sizeof(u##n): \ + state.regs[i].value = FRAME_REG(state.regs[i].value, \ + const u##n); \ + break + CASES; +#undef CASE + default: + return -EIO; + } + break; + } + } + for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { + if (REG_INVALID(i)) + continue; + switch(state.regs[i].where) { + case Nowhere: + if (reg_info[i].width != sizeof(UNW_SP(frame)) + || &FRAME_REG(i, __typeof__(UNW_SP(frame))) + != &UNW_SP(frame)) + continue; + UNW_SP(frame) = cfa; + break; + case Register: + switch(reg_info[i].width) { +#define CASE(n) case sizeof(u##n): \ + FRAME_REG(i, u##n) = state.regs[i].value; \ + break + CASES; +#undef CASE + default: + return -EIO; + } + break; + case Value: + if (reg_info[i].width != sizeof(unsigned long)) + return -EIO; + FRAME_REG(i, unsigned long) = cfa + state.regs[i].value + * state.dataAlign; + break; + case Memory: { + unsigned long addr = cfa + state.regs[i].value + * state.dataAlign; + + if ((state.regs[i].value * state.dataAlign) + % sizeof(unsigned long) + || addr < 
startLoc + || addr + sizeof(unsigned long) < addr + || addr + sizeof(unsigned long) > endLoc) + return -EIO; + switch(reg_info[i].width) { +#define CASE(n) case sizeof(u##n): \ + __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ + break + CASES; +#undef CASE + default: + return -EIO; + } + } + break; + } + } + + return 0; +#undef CASES +#undef FRAME_REG +} +EXPORT_SYMBOL(unwind); + +int unwind_init_frame_info(struct unwind_frame_info *info, + struct task_struct *tsk, + /*const*/ struct pt_regs *regs) +{ + info->task = tsk; + arch_unw_init_frame_info(info, regs); + + return 0; +} +EXPORT_SYMBOL(unwind_init_frame_info); + +/* + * Prepare to unwind a blocked task. + */ +int unwind_init_blocked(struct unwind_frame_info *info, + struct task_struct *tsk) +{ + info->task = tsk; + arch_unw_init_blocked(info); + + return 0; +} +EXPORT_SYMBOL(unwind_init_blocked); + +/* + * Prepare to unwind the currently running thread. + */ +int unwind_init_running(struct unwind_frame_info *info, + asmlinkage int (*callback)(struct unwind_frame_info *, + void *arg), + void *arg) +{ + info->task = current; + + return arch_unwind_init_running(info, callback, arg); +} +EXPORT_SYMBOL(unwind_init_running); + +/* + * Unwind until the return pointer is in user-land (or until an error + * occurs). Returns 0 if successful, negative number in case of + * error. 
+ */ +int unwind_to_user(struct unwind_frame_info *info) +{ + while (!arch_unw_user_mode(info)) { + int err = unwind(info); + + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL(unwind_to_user); diff --git a/kernel/user.c b/kernel/user.c index 2116642f42c..6408c042429 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid) atomic_set(&new->processes, 0); atomic_set(&new->files, 0); atomic_set(&new->sigpending, 0); -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER atomic_set(&new->inotify_watches, 0); atomic_set(&new->inotify_devs, 0); #endif @@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) new->mq_bytes = 0; new->locked_shm = 0; - if (alloc_uid_keyring(new) < 0) { + if (alloc_uid_keyring(new, current) < 0) { kmem_cache_free(uid_cachep, new); return NULL; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f..59f0b42bd89 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu, return ret; } -int schedule_on_each_cpu(void (*func) (void *info), void *info) +/** + * schedule_on_each_cpu - call a function on each online CPU from keventd + * @func: the function to call + * @info: a pointer to pass to func() + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * Appears to be racy against CPU hotplug. + * + * schedule_on_each_cpu() is very slow. 
+ */ +int schedule_on_each_cpu(void (*func)(void *info), void *info) { int cpu; - struct work_struct *work; + struct work_struct *works; - work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); - - if (!work) + works = alloc_percpu(struct work_struct); + if (!works) return -ENOMEM; + for_each_online_cpu(cpu) { - INIT_WORK(work + cpu, func, info); + INIT_WORK(per_cpu_ptr(works, cpu), func, info); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), - work + cpu); + per_cpu_ptr(works, cpu)); } flush_workqueue(keventd_wq); - kfree(work); + free_percpu(works); return 0; } @@ -531,11 +543,11 @@ int current_is_keventd(void) static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - LIST_HEAD(list); + struct list_head list; struct work_struct *work; spin_lock_irq(&cwq->lock); - list_splice_init(&cwq->worklist, &list); + list_replace_init(&cwq->worklist, &list); while (!list_empty(&list)) { printk("Taking work for %s\n", wq->name); @@ -547,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) } /* We're holding the cpucontrol mutex here */ -static int workqueue_cpu_callback(struct notifier_block *nfb, +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { + if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) + continue; /* Unbind so it can run. */ kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, any_online_cpu(cpu_online_map)); |