diff options
Diffstat (limited to 'kernel')
48 files changed, 1433 insertions, 829 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6c5f081132a..188c43223f5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,7 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o -obj-$(CONFIG_SYSCTL) += sysctl_check.o +obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/audit.c b/kernel/audit.c index a7b16086d36..b7d3709cc45 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -126,6 +126,8 @@ static int audit_freelist_count; static LIST_HEAD(audit_freelist); static struct sk_buff_head audit_skb_queue; +/* queue of skbs to send to auditd when/if it comes back */ +static struct sk_buff_head audit_skb_hold_queue; static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); @@ -154,6 +156,11 @@ struct audit_buffer { gfp_t gfp_mask; }; +struct audit_reply { + int pid; + struct sk_buff *skb; +}; + static void audit_set_pid(struct audit_buffer *ab, pid_t pid) { if (ab) { @@ -252,14 +259,15 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sid, int allow_changes) + uid_t loginuid, u32 sessionid, u32 sid, + int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new, - old, loginuid); + audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, + old, loginuid, sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -279,7 +287,8 @@ static int audit_log_config_change(char *function_name, int new, int old, } static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sid) + int new, uid_t loginuid, u32 sessionid, + u32 sid) { int allow_changes, rc = 0, old = *to_change; @@ -290,8 +299,8 @@ static int audit_do_config_change(char *function_name, int *to_change, allow_changes = 1; if (audit_enabled != AUDIT_OFF) { - rc = audit_log_config_change(function_name, new, old, - loginuid, sid, allow_changes); + rc = audit_log_config_change(function_name, new, old, loginuid, + sessionid, sid, allow_changes); if (rc) allow_changes = 0; } @@ -305,26 +314,28 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) +static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, + u32 sid) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, - limit, loginuid, sid); + limit, loginuid, sessionid, sid); } -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) +static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, + u32 sid) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, - limit, loginuid, sid); + limit, loginuid, sessionid, sid); } -static int audit_set_enabled(int state, uid_t loginuid, u32 sid) +static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state, - loginuid, sid); + loginuid, sessionid, sid); if (!rc) audit_ever_enabled |= !!state; @@ -332,7 +343,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) return rc; } -static int audit_set_failure(int state, uid_t loginuid, u32 sid) +static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK @@ -340,7 +351,43 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state, - loginuid, sid); + loginuid, sessionid, sid); +} + +/* + * Queue skbs to be sent to auditd when/if it comes back. These skbs should + * already have been sent via prink/syslog and so if these messages are dropped + * it is not a huge concern since we already passed the audit_log_lost() + * notification and stuff. This is just nice to get audit messages during + * boot before auditd is running or messages generated while auditd is stopped. + * This only holds messages is audit_default is set, aka booting with audit=1 + * or building your kernel that way. + */ +static void audit_hold_skb(struct sk_buff *skb) +{ + if (audit_default && + skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) + skb_queue_tail(&audit_skb_hold_queue, skb); + else + kfree_skb(skb); +} + +static void kauditd_send_skb(struct sk_buff *skb) +{ + int err; + /* take a reference in case we can't send it and we want to hold it */ + skb_get(skb); + err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); + if (err < 0) { + BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); + audit_log_lost("auditd dissapeared\n"); + audit_pid = 0; + /* we might get lucky and get this in the next auditd */ + audit_hold_skb(skb); + } else + /* drop the extra reference if sent ok */ + kfree_skb(skb); } static int kauditd_thread(void *dummy) @@ -349,24 +396,41 @@ static int kauditd_thread(void *dummy) set_freezable(); while (!kthread_should_stop()) { + /* + * if auditd just started drain the queue of messages already + * sent to syslog/printk. remember loss here is ok. we already + * called audit_log_lost() if it didn't go out normally. so the + * race between the skb_dequeue and the next check for audit_pid + * doesn't matter. + * + * if you ever find kauditd to be too slow we can get a perf win + * by doing our own locking and keeping better track if there + * are messages in this queue. I don't see the need now, but + * in 5 years when I want to play with this again I'll see this + * note and still have no friggin idea what i'm thinking today. + */ + if (audit_default && audit_pid) { + skb = skb_dequeue(&audit_skb_hold_queue); + if (unlikely(skb)) { + while (skb && audit_pid) { + kauditd_send_skb(skb); + skb = skb_dequeue(&audit_skb_hold_queue); + } + } + } + skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); if (skb) { - if (audit_pid) { - int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); - if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ - printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd dissapeared\n"); - audit_pid = 0; - } - } else { + if (audit_pid) + kauditd_send_skb(skb); + else { if (printk_ratelimit()) - printk(KERN_NOTICE "%s\n", skb->data + - NLMSG_SPACE(0)); + printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); else audit_log_lost("printk limit exceeded\n"); - kfree_skb(skb); + + audit_hold_skb(skb); } } else { DECLARE_WAITQUEUE(wait, current); @@ -385,13 +449,13 @@ static int kauditd_thread(void *dummy) return 0; } -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) +static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) { struct task_struct *tsk; int err; read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); err = -ESRCH; if (!tsk) goto out; @@ -404,7 +468,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) if (err) goto out; - tty_audit_push_task(tsk, loginuid); + tty_audit_push_task(tsk, loginuid, sessionid); out: read_unlock(&tasklist_lock); return err; @@ -469,6 +533,19 @@ nlmsg_failure: /* Used by NLMSG_PUT */ return NULL; } +static int audit_send_reply_thread(void *arg) +{ + struct audit_reply *reply = (struct audit_reply *)arg; + + mutex_lock(&audit_cmd_mutex); + mutex_unlock(&audit_cmd_mutex); + + /* Ignore failure. It'll only happen if the sender goes away, + because our timeout is set to infinite. */ + netlink_unicast(audit_sock, reply->skb, reply->pid, 0); + kfree(reply); + return 0; +} /** * audit_send_reply - send an audit reply message via netlink * @pid: process id to send reply to @@ -485,14 +562,26 @@ nlmsg_failure: /* Used by NLMSG_PUT */ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { - struct sk_buff *skb; + struct sk_buff *skb; + struct task_struct *tsk; + struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), + GFP_KERNEL); + + if (!reply) + return; + skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) return; - /* Ignore failure. It'll only happen if the sender goes away, - because our timeout is set to infinite. */ - netlink_unicast(audit_sock, skb, pid, 0); - return; + + reply->pid = pid; + reply->skb = skb; + + tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); + if (IS_ERR(tsk)) { + kfree(reply); + kfree_skb(skb); + } } /* @@ -534,7 +623,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) } static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - u32 pid, u32 uid, uid_t auid, u32 sid) + u32 pid, u32 uid, uid_t auid, u32 ses, + u32 sid) { int rc = 0; char *ctx = NULL; @@ -546,8 +636,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u", - pid, uid, auid); + audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) @@ -570,6 +660,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ + u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; @@ -591,6 +682,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; loginuid = NETLINK_CB(skb).loginuid; + sessionid = NETLINK_CB(skb).sessionid; sid = NETLINK_CB(skb).sid; seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); @@ -613,12 +705,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(status_get->enabled, - loginuid, sid); + loginuid, sessionid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(status_get->failure, - loginuid, sid); + loginuid, sessionid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_PID) { @@ -627,17 +719,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, loginuid, - sid, 1); + sessionid, sid, 1); audit_pid = new_pid; audit_nlk_pid = NETLINK_CB(skb).pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) err = audit_set_rate_limit(status_get->rate_limit, - loginuid, sid); + loginuid, sessionid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit, - loginuid, sid); + loginuid, sessionid, sid); break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: @@ -649,12 +741,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = audit_prepare_user_tty(pid, loginuid); + err = audit_prepare_user_tty(pid, loginuid, + sessionid); if (err) break; } audit_log_common_recv_msg(&ab, msg_type, pid, uid, - loginuid, sid); + loginuid, sessionid, sid); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.1024s'", @@ -664,8 +757,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_format(ab, " msg="); size = nlmsg_len(nlh); - audit_log_n_untrustedstring(ab, size, - data); + audit_log_n_untrustedstring(ab, data, size); } audit_set_pid(ab, pid); audit_log_end(ab); @@ -677,7 +769,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -688,7 +780,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid, sid); + loginuid, sessionid, sid); break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -696,7 +788,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -707,13 +799,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST_RULES: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid, sid); + loginuid, sessionid, sid); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); @@ -721,21 +813,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t len = nlmsg_len(nlh); + size_t msglen = nlmsg_len(nlh); char *old, *new; err = -EINVAL; - if (len < 2 * sizeof(u32)) + if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); - len -= 2 * sizeof(u32); - old = audit_unpack_string(&bufp, &len, sizes[0]); + msglen -= 2 * sizeof(u32); + old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } - new = audit_unpack_string(&bufp, &len, sizes[1]); + new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); @@ -745,7 +837,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_tag_tree(old, new); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); @@ -779,7 +871,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct task_struct *tsk; read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (!tsk) err = -ESRCH; else { @@ -802,7 +894,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (s->enabled != 0 && s->enabled != 1) return -EINVAL; read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (!tsk) err = -ESRCH; else { @@ -877,6 +969,7 @@ static int __init audit_init(void) audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; skb_queue_head_init(&audit_skb_queue); + skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = 1; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; @@ -1199,7 +1292,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ -void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, +void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; @@ -1235,8 +1328,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ -static void audit_log_n_string(struct audit_buffer *ab, size_t slen, - const char *string) +void audit_log_n_string(struct audit_buffer *ab, const char *string, + size_t slen) { int avail, new_len; unsigned char *ptr; @@ -1292,13 +1385,13 @@ int audit_string_contains_control(const char *string, size_t len) * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ -void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, - const char *string) +void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, + size_t len) { if (audit_string_contains_control(string, len)) - audit_log_hex(ab, string, len); + audit_log_n_hex(ab, string, len); else - audit_log_n_string(ab, len, string); + audit_log_n_string(ab, string, len); } /** @@ -1311,7 +1404,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { - audit_log_n_untrustedstring(ab, strlen(string), string); + audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ @@ -1355,19 +1448,23 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); + if (audit_pid) { - nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); skb_queue_tail(&audit_skb_queue, ab->skb); - ab->skb = NULL; wake_up_interruptible(&kauditd_wait); - } else if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) { - printk(KERN_NOTICE "type=%d %s\n", - nlh->nlmsg_type, - ab->skb->data + NLMSG_SPACE(0)); - } else - audit_log_lost("printk limit exceeded\n"); + } else { + if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) { + printk(KERN_NOTICE "type=%d %s\n", + nlh->nlmsg_type, + ab->skb->data + NLMSG_SPACE(0)); + } else + audit_log_lost("printk limit exceeded\n"); + } + audit_hold_skb(ab->skb); } + ab->skb = NULL; } audit_buffer_free(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 3cfc54ee3e1..9d6717412fe 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -74,6 +74,11 @@ struct audit_entry { struct audit_krule rule; }; +#ifdef CONFIG_AUDIT +extern int audit_enabled; +extern int audit_ever_enabled; +#endif + extern int audit_pid; #define AUDIT_INODE_BUCKETS 32 @@ -104,6 +109,9 @@ struct audit_netlink_list { int audit_send_list(void *); struct inotify_watch; +/* Inotify handle */ +extern struct inotify_handle *audit_ih; + extern void audit_free_parent(struct inotify_watch *); extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, const char *, struct inode *); @@ -111,6 +119,7 @@ extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); +extern struct list_head audit_filter_list[]; #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); @@ -137,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *); extern char *audit_unpack_string(void **, size_t *, size_t); +extern pid_t audit_sig_pid; +extern uid_t audit_sig_uid; +extern u32 audit_sig_sid; + #ifdef CONFIG_AUDITSYSCALL extern int __audit_signal_info(int sig, struct task_struct *t); static inline int audit_signal_info(int sig, struct task_struct *t) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 28fef6bf853..0e0bd27e651 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); -/* Inotify handle */ -extern struct inotify_handle *audit_ih; - /* Inotify events we care about. */ #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF -extern int audit_enabled; - void audit_free_parent(struct inotify_watch *i_watch) { struct audit_parent *parent; @@ -272,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, return -EINVAL; watch = audit_init_watch(path); - if (unlikely(IS_ERR(watch))) + if (IS_ERR(watch)) return PTR_ERR(watch); audit_get_watch(watch); @@ -422,7 +417,7 @@ exit_err: static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; - struct audit_field *f; + struct audit_field *ino_f; int err = 0; int i; @@ -483,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) if (f->val & ~15) goto exit_free; break; + case AUDIT_FILETYPE: + if ((f->val & ~S_IFMT) > S_IFMT) + goto exit_free; + break; case AUDIT_INODE: err = audit_to_inode(&entry->rule, f); if (err) @@ -504,9 +503,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) } } - f = entry->rule.inode_f; - if (f) { - switch(f->op) { + ino_f = entry->rule.inode_f; + if (ino_f) { + switch(ino_f->op) { case AUDIT_NOT_EQUAL: entry->rule.inode_f = NULL; case AUDIT_EQUAL: @@ -531,7 +530,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; - struct audit_field *f; + struct audit_field *ino_f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -654,14 +653,18 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (f->val & ~15) goto exit_free; break; + case AUDIT_FILETYPE: + if ((f->val & ~S_IFMT) > S_IFMT) + goto exit_free; + break; default: goto exit_free; } } - f = entry->rule.inode_f; - if (f) { - switch(f->op) { + ino_f = entry->rule.inode_f; + if (ino_f) { + switch(ino_f->op) { case AUDIT_NOT_EQUAL: entry->rule.inode_f = NULL; case AUDIT_EQUAL: @@ -848,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) return ERR_PTR(-ENOMEM); new = audit_init_watch(path); - if (unlikely(IS_ERR(new))) { + if (IS_ERR(new)) { kfree(path); goto out; } @@ -989,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent, audit_set_auditable(current->audit_context); nwatch = audit_dupe_watch(owatch); - if (unlikely(IS_ERR(nwatch))) { + if (IS_ERR(nwatch)) { mutex_unlock(&audit_filter_mutex); audit_panic("error updating watch, skipping"); return; @@ -1004,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent, list_del_rcu(&oentry->list); nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (unlikely(IS_ERR(nentry))) + if (IS_ERR(nentry)) audit_panic("error updating watch, removing"); else { int h = audit_hash_ino((u32)ino); @@ -1500,8 +1503,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, - struct audit_krule *rule, int res) +static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, + char *action, struct audit_krule *rule, + int res) { struct audit_buffer *ab; @@ -1511,7 +1515,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u", loginuid); + audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); if (sid) { char *ctx = NULL; u32 len; @@ -1543,7 +1547,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sid) + size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) { struct task_struct *tsk; struct audit_netlink_list *dest; @@ -1590,7 +1594,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); + audit_log_rule_change(loginuid, sessionid, sid, "add", + &entry->rule, !err); if (err) audit_free_rule(entry); @@ -1606,8 +1611,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log_rule_change(loginuid, sid, "remove", &entry->rule, - !err); + audit_log_rule_change(loginuid, sessionid, sid, "remove", + &entry->rule, !err); audit_free_rule(entry); break; @@ -1785,7 +1790,7 @@ int audit_update_lsm_rules(void) watch = entry->rule.watch; tree = entry->rule.tree; nentry = audit_dupe_rule(&entry->rule, watch); - if (unlikely(IS_ERR(nentry))) { + if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ if (!err) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 56e56ed594a..c10e7aae04d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -68,9 +68,6 @@ #include "audit.h" -extern struct list_head audit_filter_list[]; -extern int audit_ever_enabled; - /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). */ #define AUDIT_NAMES 20 @@ -283,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } +static int audit_match_filetype(struct audit_context *ctx, int which) +{ + unsigned index = which & ~S_IFMT; + mode_t mode = which & S_IFMT; + if (index >= ctx->name_count) + return 0; + if (ctx->names[index].ino == -1) + return 0; + if ((ctx->names[index].mode ^ mode) & S_IFMT) + return 0; + return 1; +} + /* * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; * ->first_trees points to its beginning, ->trees - to the current end of data. @@ -592,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PERM: result = audit_match_perm(ctx, f->val); break; + case AUDIT_FILETYPE: + result = audit_match_filetype(ctx, f->val); + break; } if (!result) @@ -1095,7 +1108,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, audit_log_format(*ab, "[%d]", i); audit_log_format(*ab, "="); if (has_cntl) - audit_log_hex(*ab, buf, to_send); + audit_log_n_hex(*ab, buf, to_send); else audit_log_format(*ab, "\"%s\"", buf); audit_log_format(*ab, "\n"); @@ -1296,7 +1309,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts break; } case AUDIT_SOCKETCALL: { - int i; struct audit_aux_data_socketcall *axs = (void *)aux; audit_log_format(ab, "nargs=%d", axs->nargs); for (i=0; i<axs->nargs; i++) @@ -1307,7 +1319,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_aux_data_sockaddr *axs = (void *)aux; audit_log_format(ab, "saddr="); - audit_log_hex(ab, axs->a, axs->len); + audit_log_n_hex(ab, axs->a, axs->len); break; } case AUDIT_FD_PAIR: { @@ -1321,7 +1333,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; - int i; for (i = 0; i < axs->pid_count; i++) if (audit_log_pid_context(context, axs->target_pid[i], @@ -1371,8 +1382,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts default: /* log the name's directory component */ audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name_len, - n->name); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); } } else audit_log_format(ab, " name=(null)"); @@ -1596,7 +1607,7 @@ static inline void handle_one(const struct inode *inode) if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference"); + printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); @@ -1656,7 +1667,7 @@ retry: } /* too bad */ printk(KERN_WARNING - "out of memory, audit has lost a tree reference"); + "out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; @@ -1752,13 +1763,13 @@ static int audit_inc_name_count(struct audit_context *context, if (context->name_count >= AUDIT_NAMES) { if (inode) printk(KERN_DEBUG "name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu", + "dev=%02x:%02x, inode=%lu\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); else - printk(KERN_DEBUG "name_count maxed, losing inode data"); + printk(KERN_DEBUG "name_count maxed, losing inode data\n"); return 1; } context->name_count++; @@ -2361,9 +2372,6 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; - extern pid_t audit_sig_pid; - extern uid_t audit_sig_uid; - extern u32 audit_sig_sid; if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { diff --git a/kernel/bounds.c b/kernel/bounds.c new file mode 100644 index 00000000000..3c530138183 --- /dev/null +++ b/kernel/bounds.c @@ -0,0 +1,19 @@ +/* + * Generate definitions needed by the preprocessor. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#define __GENERATING_BOUNDS_H +/* Include headers that define the enum constants of interest */ +#include <linux/page-flags.h> +#include <linux/mmzone.h> +#include <linux/kbuild.h> + +void foo(void) +{ + /* The enum constants to put into include/linux/bounds.h */ + DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); + DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + /* End of constants */ +} diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6d8de051382..b9d467d83fc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -44,6 +44,7 @@ #include <linux/kmod.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> +#include <linux/hash.h> #include <asm/atomic.h> @@ -118,17 +119,7 @@ static int root_count; * be called. */ static int need_forkexit_callback; - -/* bits in struct cgroup flags field */ -enum { - /* Control Group is dead */ - CGRP_REMOVED, - /* Control Group has previously had a child cgroup or a task, - * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ - CGRP_RELEASABLE, - /* Control Group requires release notifications to userspace */ - CGRP_NOTIFY_ON_RELEASE, -}; +static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link; static DEFINE_RWLOCK(css_set_lock); static int css_set_count; +/* hash table for cgroup groups. This improves the performance to + * find an existing css_set */ +#define CSS_SET_HASH_BITS 7 +#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) +static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; + +static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) +{ + int i; + int index; + unsigned long tmp = 0UL; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) + tmp += (unsigned long)css[i]; + tmp = (tmp >> 16) ^ tmp; + + index = hash_long(tmp, CSS_SET_HASH_BITS); + + return &css_set_table[index]; +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups @@ -230,7 +242,7 @@ static int use_task_css_set_links; static void unlink_css_set(struct css_set *cg) { write_lock(&css_set_lock); - list_del(&cg->list); + hlist_del(&cg->hlist); css_set_count--; while (!list_empty(&cg->cg_links)) { struct cg_cgroup_link *link; @@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg) /* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing - * css_set is suitable. This currently walks a linked-list for - * simplicity; a later patch will use a hash table for better - * performance + * css_set is suitable. * * oldcg: the cgroup group that we're using before the cgroup * transition @@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set( { int i; struct cgroupfs_root *root = cgrp->root; - struct list_head *l = &init_css_set.list; + struct hlist_head *hhead; + struct hlist_node *node; + struct css_set *cg; /* Built the set of subsystem state objects that we want to * see in the new css_set */ @@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set( } } - /* Look through existing cgroup groups to find one to reuse */ - do { - struct css_set *cg = - list_entry(l, struct css_set, list); - + hhead = css_set_hash(template); + hlist_for_each_entry(cg, node, hhead, hlist) { if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { /* All subsystems matched */ return cg; } - /* Try the next cgroup group */ - l = l->next; - } while (l != &init_css_set.list); + } /* No existing cgroup group matched */ return NULL; @@ -404,6 +411,8 @@ static struct css_set *find_css_set( struct list_head tmp_cg_links; struct cg_cgroup_link *link; + struct hlist_head *hhead; + /* First see if we already have a cgroup group that matches * the desired set */ write_lock(&css_set_lock); @@ -428,6 +437,7 @@ static struct css_set *find_css_set( kref_init(&res->ref); INIT_LIST_HEAD(&res->cg_links); INIT_LIST_HEAD(&res->tasks); + INIT_HLIST_NODE(&res->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ @@ -467,9 +477,12 @@ static struct css_set *find_css_set( BUG_ON(!list_empty(&tmp_cg_links)); - /* Link this cgroup group into the list */ - list_add(&res->list, &init_css_set.list); css_set_count++; + + /* Add this cgroup group to the hash table */ + hhead = css_set_hash(res->subsys); + hlist_add_head(&res->hlist, hhead); + write_unlock(&css_set_lock); return res; @@ -948,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *root; - struct list_head tmp_cg_links, *l; + struct list_head tmp_cg_links; INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ @@ -990,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, /* New superblock */ struct cgroup *cgrp = &root->top_cgroup; struct inode *inode; + int i; BUG_ON(sb->s_root != NULL); @@ -1034,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - l = &init_css_set.list; - do { + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct hlist_head *hhead = &css_set_table[i]; + struct hlist_node *node; struct css_set *cg; - struct cg_cgroup_link *link; - cg = list_entry(l, struct css_set, list); - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - l = l->next; - } while (l != &init_css_set.list); + + hlist_for_each_entry(cg, node, hhead, hlist) { + struct cg_cgroup_link *link; + + BUG_ON(list_empty(&tmp_cg_links)); + link = list_entry(tmp_cg_links.next, + struct cg_cgroup_link, + cgrp_link_list); + list_del(&link->cgrp_link_list); + link->cg = cg; + list_add(&link->cgrp_link_list, + &root->top_cgroup.css_sets); + list_add(&link->cg_link_list, &cg->cg_links); + } + } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); @@ -1307,18 +1324,16 @@ enum cgroup_filetype { FILE_DIR, FILE_TASKLIST, FILE_NOTIFY_ON_RELEASE, - FILE_RELEASABLE, FILE_RELEASE_AGENT, }; -static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { char buffer[64]; int retval = 0; - u64 val; char *end; if (!nbytes) @@ -1329,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - - /* strip newline if necessary */ - if (nbytes && (buffer[nbytes-1] == '\n')) - buffer[nbytes-1] = 0; - val = simple_strtoull(buffer, &end, 0); - if (*end) - return -EINVAL; - - /* Pass to subsystem */ - retval = cft->write_uint(cgrp, cft, val); + strstrip(buffer); + if (cft->write_u64) { + u64 val = simple_strtoull(buffer, &end, 0); + if (*end) + return -EINVAL; + retval = cft->write_u64(cgrp, cft, val); + } else { + s64 val = simple_strtoll(buffer, &end, 0); + if (*end) + return -EINVAL; + retval = cft->write_s64(cgrp, cft, val); + } if (!retval) retval = nbytes; return retval; @@ -1419,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); - if (cft->write_uint) - return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_u64 || cft->write_s64) + return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->trigger) { + int ret = cft->trigger(cgrp, (unsigned int)cft->private); + return ret ? ret : nbytes; + } return -EINVAL; } -static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { char tmp[64]; - u64 val = cft->read_uint(cgrp, cft); + u64 val = cft->read_u64(cgrp, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } +static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) +{ + char tmp[64]; + s64 val = cft->read_s64(cgrp, cft); + int len = sprintf(tmp, "%lld\n", (long long) val); + + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); +} + static ssize_t cgroup_common_file_read(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -1490,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); - if (cft->read_uint) - return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); + if (cft->read_u64) + return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->read_s64) + return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); return -EINVAL; } +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. + */ + +struct cgroup_seqfile_state { + struct cftype *cft; + struct cgroup *cgroup; +}; + +static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) +{ + struct seq_file *sf = cb->state; + return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); +} + +static int cgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct cgroup_seqfile_state *state = m->private; + struct cftype *cft = state->cft; + if (cft->read_map) { + struct cgroup_map_cb cb = { + .fill = cgroup_map_add, + .state = m, + }; + return cft->read_map(state->cgroup, cft, &cb); + } + return cft->read_seq_string(state->cgroup, cft, m); +} + +int cgroup_seqfile_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + kfree(seq->private); + return single_release(inode, file); +} + +static struct file_operations cgroup_seqfile_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = cgroup_seqfile_release, +}; + static int cgroup_file_open(struct inode *inode, struct file *file) { int err; @@ -1507,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) cft = __d_cft(file->f_dentry); if (!cft) return -ENODEV; - if (cft->open) + if (cft->read_map || cft->read_seq_string) { + struct cgroup_seqfile_state *state = + kzalloc(sizeof(*state), GFP_USER); + if (!state) + return -ENOMEM; + state->cft = cft; + state->cgroup = __d_cgrp(file->f_dentry->d_parent); + file->f_op = &cgroup_seqfile_operations; + err = single_open(file, cgroup_seqfile_show, state); + if (err < 0) + kfree(state); + } else if (cft->open) err = cft->open(inode, file); else err = 0; @@ -1715,7 +1804,7 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * The tasklist_lock is not held here, as do_each_thread() and * while_each_thread() are protected by RCU. */ -void cgroup_enable_task_cg_lists(void) +static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; write_lock(&css_set_lock); @@ -1913,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) if (heap->size) { for (i = 0; i < heap->size; i++) { - struct task_struct *p = heap->ptrs[i]; + struct task_struct *q = heap->ptrs[i]; if (i == 0) { - latest_time = p->start_time; - latest_task = p; + latest_time = q->start_time; + latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(p, scan); - put_task_struct(p); + scan->process_task(q, scan); + put_task_struct(q); } /* * If we had to process any tasks at all, scan again @@ -2138,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, return notify_on_release(cgrp); } -static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - /* * for the common functions, 'private' gives the type of file */ @@ -2158,16 +2242,10 @@ static struct cftype files[] = { { .name = "notify_on_release", - .read_uint = cgroup_read_notify_on_release, + .read_u64 = cgroup_read_notify_on_release, .write = cgroup_common_file_write, .private = FILE_NOTIFY_ON_RELEASE, }, - - { - .name = "releasable", - .read_uint = cgroup_read_releasable, - .private = FILE_RELEASABLE, - } }; static struct cftype cft_release_agent = { @@ -2401,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return 0; } -static void cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; - struct list_head *l; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); @@ -2415,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); - /* Update all cgroup groups to contain a subsys + /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is - * newly registered, all tasks and hence all cgroup - * groups are in the subsystem's top cgroup. */ - write_lock(&css_set_lock); - l = &init_css_set.list; - do { - struct css_set *cg = - list_entry(l, struct css_set, list); - cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; - l = l->next; - } while (l != &init_css_set.list); - write_unlock(&css_set_lock); - - /* If this subsystem requested that it be notified with fork - * events, we should send it one now for every process in the - * system */ - if (ss->fork) { - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - ss->fork(ss, p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - } + * newly registered, all tasks and hence the + * init_css_set is in the subsystem's top cgroup. */ + init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; + need_mm_owner_callback |= !!ss->mm_owner_changed; + + /* At system boot, before all subsystems have been + * registered, no tasks have been forked, so we don't + * need to invoke fork callbacks here. */ + BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; } @@ -2458,9 +2520,9 @@ int __init cgroup_init_early(void) int i; kref_init(&init_css_set.ref); kref_get(&init_css_set.ref); - INIT_LIST_HEAD(&init_css_set.list); INIT_LIST_HEAD(&init_css_set.cg_links); INIT_LIST_HEAD(&init_css_set.tasks); + INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); list_add(&rootnode.root_list, &roots); @@ -2473,6 +2535,9 @@ int __init cgroup_init_early(void) list_add(&init_css_set_link.cg_link_list, &init_css_set.cg_links); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&css_set_table[i]); + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -2502,7 +2567,7 @@ int __init cgroup_init(void) { int err; int i; - struct proc_dir_entry *entry; + struct hlist_head *hhead; err = bdi_init(&cgroup_backing_dev_info); if (err) @@ -2514,13 +2579,15 @@ int __init cgroup_init(void) cgroup_init_subsys(ss); } + /* Add init_css_set to the hash table */ + hhead = css_set_hash(init_css_set.subsys); + hlist_add_head(&init_css_set.hlist, hhead); + err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; - entry = create_proc_entry("cgroups", 0, NULL); - if (entry) - entry->proc_fops = &proc_cgroupstats_operations; + proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); out: if (err) @@ -2683,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child) } } +#ifdef CONFIG_MM_OWNER +/** + * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes + * @p: the new owner + * + * Called on every change to mm->owner. mm_init_owner() does not + * invoke this routine, since it assigns the mm->owner the first time + * and does not change it. + */ +void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) +{ + struct cgroup *oldcgrp, *newcgrp; + + if (need_mm_owner_callback) { + int i; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + oldcgrp = task_cgroup(old, ss->subsys_id); + newcgrp = task_cgroup(new, ss->subsys_id); + if (oldcgrp == newcgrp) + continue; + if (ss->mm_owner_changed) + ss->mm_owner_changed(ss, oldcgrp, newcgrp); + } + } +} +#endif /* CONFIG_MM_OWNER */ + /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index 37301e877cb..c3dc3aba4c0 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -1,5 +1,5 @@ /* - * kernel/ccontainer_debug.c - Example cgroup subsystem that + * kernel/cgroup_debug.c - Example cgroup subsystem that * exposes debug info * * Copyright (C) Google Inc, 2007 @@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, return count; } +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + static struct cftype files[] = { { .name = "cgroup_refcount", - .read_uint = cgroup_refcount_read, + .read_u64 = cgroup_refcount_read, }, { .name = "taskcount", - .read_uint = taskcount_read, + .read_u64 = taskcount_read, }, { .name = "current_css_set", - .read_uint = current_css_set_read, + .read_u64 = current_css_set_read, }, { .name = "current_css_set_refcount", - .read_uint = current_css_set_refcount_read, + .read_u64 = current_css_set_refcount_read, }, + + { + .name = "releasable", + .read_u64 = releasable_read, + } }; static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/configs.c b/kernel/configs.c index e84d3f9c6c7..4c345210ed8 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -79,12 +79,11 @@ static int __init ikconfig_init(void) struct proc_dir_entry *entry; /* create the current config file */ - entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, - &proc_root); + entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, + &ikconfig_file_ops); if (!entry) return -ENOMEM; - entry->proc_fops = &ikconfig_file_ops; entry->size = kernel_config_data_size; return 0; @@ -95,7 +94,7 @@ static int __init ikconfig_init(void) static void __exit ikconfig_cleanup(void) { - remove_proc_entry("config.gz", &proc_root); + remove_proc_entry("config.gz", NULL); } module_init(ikconfig_init); diff --git a/kernel/cpu.c b/kernel/cpu.c index 2011ad8d269..a98f6ab16ec 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -33,17 +33,13 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; - wait_queue_head_t writer_queue; } cpu_hotplug; -#define writer_exists() (cpu_hotplug.active_writer != NULL) - void __init cpu_hotplug_init(void) { cpu_hotplug.active_writer = NULL; mutex_init(&cpu_hotplug.lock); cpu_hotplug.refcount = 0; - init_waitqueue_head(&cpu_hotplug.writer_queue); } #ifdef CONFIG_HOTPLUG_CPU @@ -65,11 +61,8 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; mutex_lock(&cpu_hotplug.lock); - cpu_hotplug.refcount--; - - if (unlikely(writer_exists()) && !cpu_hotplug.refcount) - wake_up(&cpu_hotplug.writer_queue); - + if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) + wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); } @@ -98,8 +91,8 @@ void cpu_maps_update_done(void) * Note that during a cpu-hotplug operation, the new readers, if any, * will be blocked by the cpu_hotplug.lock * - * Since cpu_maps_update_begin is always called after invoking - * cpu_maps_update_begin, we can be sure that only one writer is active. + * Since cpu_hotplug_begin() is always called after invoking + * cpu_maps_update_begin(), we can be sure that only one writer is active. * * Note that theoretically, there is a possibility of a livelock: * - Refcount goes to zero, last reader wakes up the sleeping @@ -115,19 +108,16 @@ void cpu_maps_update_done(void) */ static void cpu_hotplug_begin(void) { - DECLARE_WAITQUEUE(wait, current); - - mutex_lock(&cpu_hotplug.lock); - cpu_hotplug.active_writer = current; - add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); - while (cpu_hotplug.refcount) { - set_current_state(TASK_UNINTERRUPTIBLE); + + for (;;) { + mutex_lock(&cpu_hotplug.lock); + if (likely(!cpu_hotplug.refcount)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); mutex_unlock(&cpu_hotplug.lock); schedule(); - mutex_lock(&cpu_hotplug.lock); } - remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); } static void cpu_hotplug_done(void) @@ -136,7 +126,7 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } /* Need to know about CPUs going up/down? */ -int __cpuinit register_cpu_notifier(struct notifier_block *nb) +int __ref register_cpu_notifier(struct notifier_block *nb) { int ret; cpu_maps_update_begin(); @@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb) EXPORT_SYMBOL(register_cpu_notifier); -void unregister_cpu_notifier(struct notifier_block *nb) +void __ref unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); @@ -180,7 +170,7 @@ struct take_cpu_down_param { }; /* Take this CPU down. */ -static int take_cpu_down(void *_param) +static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; int err; @@ -199,7 +189,7 @@ static int take_cpu_down(void *_param) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; struct task_struct *p; @@ -274,7 +264,7 @@ out_release: return err; } -int cpu_down(unsigned int cpu) +int __ref cpu_down(unsigned int cpu) { int err = 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8b35fbd8292..8da627d3380 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner { typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + CS_MEM_HARDWALL, CS_MEMORY_MIGRATE, CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, @@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs) return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + static inline int is_sched_load_balance(const struct cpuset *cs) { return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); @@ -735,7 +741,8 @@ static inline int started_after(void *p1, void *p2) * Return nonzero if this tasks's cpus_allowed mask should be changed (in other * words, if its mask is not equal to its cpuset's mask). */ -int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +static int cpuset_test_cpumask(struct task_struct *tsk, + struct cgroup_scanner *scan) { return !cpus_equal(tsk->cpus_allowed, (cgroup_cs(scan->cg))->cpus_allowed); @@ -752,7 +759,8 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cgroup_lock() at this point. */ -void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +static void cpuset_change_cpumask(struct task_struct *tsk, + struct cgroup_scanner *scan) { set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); } @@ -941,7 +949,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); - cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ + cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ @@ -992,7 +1000,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * rebind the vma mempolicies of each mm in mmarray[] to their * new cpuset, and release that mm. The mpol_rebind_mm() * call takes mmap_sem, which we couldn't take while holding - * tasklist_lock. Forks can happen again now - the mpol_copy() + * tasklist_lock. Forks can happen again now - the mpol_dup() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global * cgroup_mutex, we know that no other rebind effort will @@ -1023,19 +1031,6 @@ int current_cpuset_is_being_rebound(void) return task_cs(current) == cpuset_being_rebound; } -/* - * Call with cgroup_mutex held. - */ - -static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) -{ - if (simple_strtoul(buf, NULL, 10) != 0) - cpuset_memory_pressure_enabled = 1; - else - cpuset_memory_pressure_enabled = 0; - return 0; -} - static int update_relax_domain_level(struct cpuset *cs, char *buf) { int val = simple_strtol(buf, NULL, 10); @@ -1053,25 +1048,20 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf) /* * update_flag - read a 0 or a 1 in a file and update associated flag - * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_SCHED_LOAD_BALANCE, - * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, - * CS_SPREAD_PAGE, CS_SPREAD_SLAB) - * cs: the cpuset to update - * buf: the buffer where we read the 0 or 1 + * bit: the bit to update (see cpuset_flagbits_t) + * cs: the cpuset to update + * turning_on: whether the flag is being set or cleared * * Call with cgroup_mutex held. */ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on) { - int turning_on; struct cpuset trialcs; int err; int cpus_nonempty, balance_flag_changed; - turning_on = (simple_strtoul(buf, NULL, 10) != 0); - trialcs = *cs; if (turning_on) set_bit(bit, &trialcs.flags); @@ -1241,6 +1231,7 @@ typedef enum { FILE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, FILE_SCHED_LOAD_BALANCE, FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, @@ -1265,7 +1256,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, return -E2BIG; /* +1 for nul-terminator */ - if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buffer) return -ENOMEM; if (copy_from_user(buffer, userbuf, nbytes)) { @@ -1288,46 +1280,71 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, case FILE_MEMLIST: retval = update_nodemask(cs, buffer); break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, buffer); + break; + default: + retval = -EINVAL; + goto out2; + } + + if (retval == 0) + retval = nbytes; +out2: + cgroup_unlock(); +out1: + kfree(buffer); + return retval; +} + +static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + cgroup_lock(); + + if (cgroup_is_removed(cgrp)) { + cgroup_unlock(); + return -ENODEV; + } + + switch (type) { case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); break; - case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); + case FILE_MEM_HARDWALL: + retval = update_flag(CS_MEM_HARDWALL, cs, val); break; - case FILE_SCHED_RELAX_DOMAIN_LEVEL: - retval = update_relax_domain_level(cs, buffer); + case FILE_SCHED_LOAD_BALANCE: + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: - retval = update_memory_pressure_enabled(cs, buffer); + cpuset_memory_pressure_enabled = !!val; break; case FILE_MEMORY_PRESSURE: retval = -EACCES; break; case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, buffer); + retval = update_flag(CS_SPREAD_PAGE, cs, val); cs->mems_generation = cpuset_mems_generation++; break; case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, buffer); + retval = update_flag(CS_SPREAD_SLAB, cs, val); cs->mems_generation = cpuset_mems_generation++; break; default: retval = -EINVAL; - goto out2; + break; } - - if (retval == 0) - retval = nbytes; -out2: cgroup_unlock(); -out1: - kfree(buffer); return retval; } @@ -1389,33 +1406,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_MEMLIST: s += cpuset_sprintf_memlist(s, cs); break; - case FILE_CPU_EXCLUSIVE: - *s++ = is_cpu_exclusive(cs) ? '1' : '0'; - break; - case FILE_MEM_EXCLUSIVE: - *s++ = is_mem_exclusive(cs) ? '1' : '0'; - break; - case FILE_SCHED_LOAD_BALANCE: - *s++ = is_sched_load_balance(cs) ? '1' : '0'; - break; case FILE_SCHED_RELAX_DOMAIN_LEVEL: s += sprintf(s, "%d", cs->relax_domain_level); break; - case FILE_MEMORY_MIGRATE: - *s++ = is_memory_migrate(cs) ? '1' : '0'; - break; - case FILE_MEMORY_PRESSURE_ENABLED: - *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; - break; - case FILE_MEMORY_PRESSURE: - s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); - break; - case FILE_SPREAD_PAGE: - *s++ = is_spread_page(cs) ? '1' : '0'; - break; - case FILE_SPREAD_SLAB: - *s++ = is_spread_slab(cs) ? '1' : '0'; - break; default: retval = -EINVAL; goto out; @@ -1428,121 +1421,137 @@ out: return retval; } - - +static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) +{ + struct cpuset *cs = cgroup_cs(cont); + cpuset_filetype_t type = cft->private; + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } +} /* * for the common functions, 'private' gives the type of file */ -static struct cftype cft_cpus = { - .name = "cpus", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_CPULIST, -}; - -static struct cftype cft_mems = { - .name = "mems", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEMLIST, -}; - -static struct cftype cft_cpu_exclusive = { - .name = "cpu_exclusive", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_CPU_EXCLUSIVE, -}; - -static struct cftype cft_mem_exclusive = { - .name = "mem_exclusive", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEM_EXCLUSIVE, -}; - -static struct cftype cft_sched_load_balance = { - .name = "sched_load_balance", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_SCHED_LOAD_BALANCE, -}; - -static struct cftype cft_sched_relax_domain_level = { - .name = "sched_relax_domain_level", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, -}; - -static struct cftype cft_memory_migrate = { - .name = "memory_migrate", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEMORY_MIGRATE, +static struct cftype files[] = { + { + .name = "cpus", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + .name = "memory_spread_slab", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, }; static struct cftype cft_memory_pressure_enabled = { .name = "memory_pressure_enabled", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE_ENABLED, }; -static struct cftype cft_memory_pressure = { - .name = "memory_pressure", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEMORY_PRESSURE, -}; - -static struct cftype cft_spread_page = { - .name = "memory_spread_page", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_SPREAD_PAGE, -}; - -static struct cftype cft_spread_slab = { - .name = "memory_spread_slab", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_SPREAD_SLAB, -}; - static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; - if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, - &cft_sched_relax_domain_level)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0) + err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); + if (err) return err; /* memory_pressure_enabled is in root cpuset only */ - if (err == 0 && !cont->parent) + if (!cont->parent) err = cgroup_add_file(cont, ss, - &cft_memory_pressure_enabled); - return 0; + &cft_memory_pressure_enabled); + return err; } /* @@ -1642,7 +1651,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) cpuset_update_task_memory_state(); if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; kfree(cs); @@ -1707,7 +1716,8 @@ int __init cpuset_init(void) * Called by cgroup_scan_tasks() for each task in a cgroup. * Return nonzero to stop the walk through the tasks. */ -void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) +static void cpuset_do_move_task(struct task_struct *tsk, + struct cgroup_scanner *scan) { struct cpuset_hotplug_scanner *chsp; @@ -1958,33 +1968,25 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) } /** - * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed - * @zl: the zonelist to be checked + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed + * @nodemask: the nodemask to be checked * - * Are any of the nodes on zonelist zl allowed in current->mems_allowed? + * Are any of the nodes in the nodemask allowed in current->mems_allowed? */ -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { - int i; - - for (i = 0; zl->zones[i]; i++) { - int nid = zone_to_nid(zl->zones[i]); - - if (node_isset(nid, current->mems_allowed)) - return 1; - } - return 0; + return nodes_intersects(*nodemask, current->mems_allowed); } /* - * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call holding callback_mutex. - * If no ancestor is mem_exclusive (an unusual configuration), then - * returns the root cpuset. + * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or + * mem_hardwall ancestor to the specified cpuset. Call holding + * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall + * (an unusual configuration), then returns the root cpuset. */ -static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) { - while (!is_mem_exclusive(cs) && cs->parent) + while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) cs = cs->parent; return cs; } @@ -1998,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * __GFP_THISNODE is set, yes, we can always allocate. If zone * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest - * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * hardwalled cpuset ancestor to this tasks cpuset, yes. * If the task has been OOM killed and has access to memory reserves * as specified by the TIF_MEMDIE flag, yes. * Otherwise, no. @@ -2021,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed as is marked TIF_MEMDIE. * GFP_KERNEL allocations are not so marked, so can escape to the - * nearest enclosing mem_exclusive ancestor cpuset. + * nearest enclosing hardwalled ancestor cpuset. * * Scanning up parent cpusets requires callback_mutex. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit @@ -2044,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * TIF_MEMDIE - any node ok - * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: @@ -2081,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) mutex_lock(&callback_mutex); task_lock(current); - cs = nearest_exclusive_ancestor(task_cs(current)); + cs = nearest_hardwall_ancestor(task_cs(current)); task_unlock(current); allowed = node_isset(node, cs->mems_allowed); diff --git a/kernel/dma.c b/kernel/dma.c index 6a82bb716da..d2c60a82279 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = { static int __init proc_dma_init(void) { - struct proc_dir_entry *e; - - e = create_proc_entry("dma", 0, NULL); - if (e) - e->proc_fops = &proc_dma_operations; - + proc_create("dma", 0, NULL, &proc_dma_operations); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 97f609f574b..ae0f2c4e452 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk) EXPORT_SYMBOL_GPL(exit_fs); +#ifdef CONFIG_MM_OWNER +/* + * Task p is exiting and it owned mm, lets find a new owner for it + */ +static inline int +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) +{ + /* + * If there are other users of the mm and the owner (us) is exiting + * we need to find a new owner to take on the responsibility. + */ + if (!mm) + return 0; + if (atomic_read(&mm->mm_users) <= 1) + return 0; + if (mm->owner != p) + return 0; + return 1; +} + +void mm_update_next_owner(struct mm_struct *mm) +{ + struct task_struct *c, *g, *p = current; + +retry: + if (!mm_need_new_owner(mm, p)) + return; + + read_lock(&tasklist_lock); + /* + * Search in the children + */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search in the siblings + */ + list_for_each_entry(c, &p->parent->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search through everything else. We should not get + * here often + */ + do_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + } while_each_thread(g, c); + + read_unlock(&tasklist_lock); + return; + +assign_new_owner: + BUG_ON(c == p); + get_task_struct(c); + /* + * The task_lock protects c->mm from changing. + * We always want mm->owner->mm == mm + */ + task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); + if (c->mm != mm) { + task_unlock(c); + put_task_struct(c); + goto retry; + } + cgroup_mm_owner_callbacks(mm->owner, c); + mm->owner = c; + task_unlock(c); + put_task_struct(c); +} +#endif /* CONFIG_MM_OWNER */ + /* * Turn us into a lazy TLB process if we * aren't already.. @@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk) /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); task_unlock(tsk); + mm_update_next_owner(mm); mmput(mm); } @@ -967,7 +1050,7 @@ NORET_TYPE void do_exit(long code) proc_exit_connector(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA - mpol_free(tsk->mempolicy); + mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX diff --git a/kernel/fork.c b/kernel/fork.c index c674aa8d3c3..068ffe00752 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -279,7 +279,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; - pol = mpol_copy(vma_policy(mpnt)); + pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; @@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; - mm_init_cgroup(mm, p); + mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } - mm_free_cgroup(mm); free_mm(mm); return NULL; } @@ -432,13 +431,13 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); + set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } put_swap_token(mm); - mm_free_cgroup(mm); mmdrop(mm); } } @@ -545,6 +544,8 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (init_new_context(tsk, mm)) goto fail_nocontext; + dup_mm_exe_file(oldmm, mm); + err = dup_mmap(mm, oldmm); if (err) goto free_pt; @@ -982,6 +983,13 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } +#ifdef CONFIG_MM_OWNER +void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ + mm->owner = p; +} +#endif /* CONFIG_MM_OWNER */ + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1116,7 +1124,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); + p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; @@ -1374,7 +1382,7 @@ bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA - mpol_free(p->mempolicy); + mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); @@ -1664,18 +1672,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp } /* - * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not - * supported yet - */ -static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) -{ - if (unshare_flags & CLONE_SYSVSEM) - return -EINVAL; - - return 0; -} - -/* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* * functions used by do_fork() cannot be used here directly @@ -1690,8 +1686,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; - struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL; + int do_sysvsem = 0; check_unshare_flags(&unshare_flags); @@ -1703,6 +1699,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) CLONE_NEWNET)) goto bad_unshare_out; + /* + * CLONE_NEWIPC must also detach from the undolist: after switching + * to a new ipc namespace, the semaphore arrays from the old + * namespace are unreachable. + */ + if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) + do_sysvsem = 1; if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) @@ -1713,13 +1716,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; - if ((err = unshare_semundo(unshare_flags, &new_ulist))) - goto bad_unshare_cleanup_fd; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) - goto bad_unshare_cleanup_semundo; + goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { + if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (do_sysvsem) { + /* + * CLONE_SYSVSEM is equivalent to sys_exit(). + */ + exit_sem(current); + } if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); @@ -1755,7 +1762,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f78777abe76..dea4c9124ac 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -590,7 +590,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); timer->state = HRTIMER_STATE_PENDING; - raise_softirq(HRTIMER_SOFTIRQ); return 1; default: BUG(); @@ -633,6 +632,11 @@ static int hrtimer_switch_to_hres(void) return 1; } +static inline void hrtimer_raise_softirq(void) +{ + raise_softirq(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -651,6 +655,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, { return 0; } +static inline void hrtimer_raise_softirq(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -850,7 +855,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, raise; base = lock_hrtimer_base(timer, &flags); @@ -884,8 +889,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) enqueue_hrtimer(timer, new_base, new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + /* + * The timer may be expired and moved to the cb_pending + * list. We can not raise the softirq with base lock held due + * to a possible deadlock with runqueue lock. + */ + raise = timer->state == HRTIMER_STATE_PENDING; + unlock_hrtimer_base(timer, &flags); + if (raise) + hrtimer_raise_softirq(); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_start); @@ -1080,8 +1095,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) * If the timer was rearmed on another CPU, reprogram * the event device. */ - if (timer->base->first == &timer->node) - hrtimer_reprogram(timer, timer->base); + struct hrtimer_clock_base *base = timer->base; + + if (base->first == &timer->node && + hrtimer_reprogram(timer, base)) { + /* + * Timer is expired. Thus move it from tree to + * pending list again. + */ + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + } } } spin_unlock_irq(&cpu_base->lock); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 6d9204f3a37..38a25b8d8bf 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,6 +1,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/device.h> +#include <linux/gfp.h> /* * Device resource management aware IRQ request/free implementation. diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 438a0146428..46e4ad1723f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> +#include <linux/slab.h> #include "internals.h" diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f091d13def0..6fc0040f3e3 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = { static int __init kallsyms_init(void) { - struct proc_dir_entry *entry; - - entry = create_proc_entry("kallsyms", 0444, NULL); - if (entry) - entry->proc_fops = &kallsyms_operations; + proc_create("kallsyms", 0444, NULL, &kallsyms_operations); return 0; } __initcall(kallsyms_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index 6782dce93d0..cb85c79989b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1405,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); arch_crash_save_vmcoreinfo(); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fcfb580c3af..1e0250cb948 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +/* + * Normally, functions that we'd want to prohibit kprobes in, are marked + * __kprobes. But, there are cases where such functions already belong to + * a different section (__sched for preempt_schedule) + * + * For such cases, we now have a blacklist + */ +struct kprobe_blackpoint kprobe_blacklist[] = { + {"preempt_schedule",}, + {NULL} /* Terminator */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp) } } +static void __kprobes cleanup_rp_inst(struct kretprobe *rp) +{ + unsigned long flags; + struct kretprobe_instance *ri; + struct hlist_node *pos, *next; + /* No race here */ + spin_lock_irqsave(&kretprobe_lock, flags); + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { + ri->rp = NULL; + hlist_del(&ri->uflist); + } + spin_unlock_irqrestore(&kretprobe_lock, flags); + free_rp_inst(rp); +} + /* * Keep all fields in the kprobe consistent */ @@ -492,9 +519,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { + struct kprobe_blackpoint *kb; + if (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) return -EINVAL; + /* + * If there exists a kprobe_blacklist, verify and + * fail any probe registration in the prohibited area + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + if (kb->start_addr) { + if (addr >= kb->start_addr && + addr < (kb->start_addr + kb->range)) + return -EINVAL; + } + } return 0; } @@ -555,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, } p->nmissed = 0; + INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -581,35 +622,28 @@ out: return ret; } -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_kprobe(struct kprobe *p) +/* + * Unregister a kprobe without a scheduler synchronization. + */ +static int __kprobes __unregister_kprobe_top(struct kprobe *p) { - struct module *mod; struct kprobe *old_p, *list_p; - int cleanup_p; - mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) { - mutex_unlock(&kprobe_mutex); - return; - } + if (unlikely(!old_p)) + return -EINVAL; + if (p != old_p) { list_for_each_entry_rcu(list_p, &old_p->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid_p; - mutex_unlock(&kprobe_mutex); - return; + return -EINVAL; } valid_p: if (old_p == p || (old_p->pre_handler == aggr_pre_handler && - p->list.next == &old_p->list && p->list.prev == &old_p->list)) { + list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are * enabled - otherwise, the breakpoint would already have @@ -618,43 +652,97 @@ valid_p: if (kprobe_enabled) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); - cleanup_p = 1; } else { + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler) { + list_for_each_entry_rcu(list_p, &old_p->list, list) { + if ((list_p != p) && (list_p->post_handler)) + goto noclean; + } + old_p->post_handler = NULL; + } +noclean: list_del_rcu(&p->list); - cleanup_p = 0; } + return 0; +} - mutex_unlock(&kprobe_mutex); +static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) +{ + struct module *mod; + struct kprobe *old_p; - synchronize_sched(); if (p->mod_refcounted) { mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); } - if (cleanup_p) { - if (p != old_p) { - list_del_rcu(&p->list); + if (list_empty(&p->list) || list_is_singular(&p->list)) { + if (!list_empty(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); kfree(old_p); } arch_remove_kprobe(p); - } else { - mutex_lock(&kprobe_mutex); - if (p->break_handler) - old_p->break_handler = NULL; - if (p->post_handler){ - list_for_each_entry_rcu(list_p, &old_p->list, list){ - if (list_p->post_handler){ - cleanup_p = 2; - break; - } - } - if (cleanup_p == 0) - old_p->post_handler = NULL; + } +} + +static int __register_kprobes(struct kprobe **kps, int num, + unsigned long called_from) +{ + int i, ret = 0; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kprobe(kps[i], called_from); + if (ret < 0 && i > 0) { + unregister_kprobes(kps, i); + break; } - mutex_unlock(&kprobe_mutex); } + return ret; +} + +/* + * Registration and unregistration functions for kprobe. + */ +int __kprobes register_kprobe(struct kprobe *p) +{ + return __register_kprobes(&p, 1, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobe(struct kprobe *p) +{ + unregister_kprobes(&p, 1); +} + +int __kprobes register_kprobes(struct kprobe **kps, int num) +{ + return __register_kprobes(kps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobes(struct kprobe **kps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(kps[i]) < 0) + kps[i]->addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) + if (kps[i]->addr) + __unregister_kprobe_bottom(kps[i]); } static struct notifier_block kprobe_exceptions_nb = { @@ -667,24 +755,69 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -int __kprobes register_jprobe(struct jprobe *jp) +static int __register_jprobes(struct jprobe **jps, int num, + unsigned long called_from) { - unsigned long addr = arch_deref_entry_point(jp->entry); + struct jprobe *jp; + int ret = 0, i; - if (!kernel_text_address(addr)) + if (num <= 0) return -EINVAL; + for (i = 0; i < num; i++) { + unsigned long addr; + jp = jps[i]; + addr = arch_deref_entry_point(jp->entry); + + if (!kernel_text_address(addr)) + ret = -EINVAL; + else { + /* Todo: Verify probepoint is a function entry point */ + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + ret = __register_kprobe(&jp->kp, called_from); + } + if (ret < 0 && i > 0) { + unregister_jprobes(jps, i); + break; + } + } + return ret; +} - /* Todo: Verify probepoint is a function entry point */ - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - - return __register_kprobe(&jp->kp, +int __kprobes register_jprobe(struct jprobe *jp) +{ + return __register_jprobes(&jp, 1, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_jprobe(struct jprobe *jp) { - unregister_kprobe(&jp->kp); + unregister_jprobes(&jp, 1); +} + +int __kprobes register_jprobes(struct jprobe **jps, int num) +{ + return __register_jprobes(jps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_jprobes(struct jprobe **jps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&jps[i]->kp) < 0) + jps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) { + if (jps[i]->kp.addr) + __unregister_kprobe_bottom(&jps[i]->kp); + } } #ifdef CONFIG_KRETPROBES @@ -725,7 +858,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -int __kprobes register_kretprobe(struct kretprobe *rp) +static int __kprobes __register_kretprobe(struct kretprobe *rp, + unsigned long called_from) { int ret = 0; struct kretprobe_instance *inst; @@ -771,46 +905,101 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->nmissed = 0; /* Establish function entry probe point */ - if ((ret = __register_kprobe(&rp->kp, - (unsigned long)__builtin_return_address(0))) != 0) + ret = __register_kprobe(&rp->kp, called_from); + if (ret != 0) free_rp_inst(rp); return ret; } +static int __register_kretprobes(struct kretprobe **rps, int num, + unsigned long called_from) +{ + int ret = 0, i; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kretprobe(rps[i], called_from); + if (ret < 0 && i > 0) { + unregister_kretprobes(rps, i); + break; + } + } + return ret; +} + +int __kprobes register_kretprobe(struct kretprobe *rp) +{ + return __register_kretprobes(&rp, 1, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kretprobe(struct kretprobe *rp) +{ + unregister_kretprobes(&rp, 1); +} + +int __kprobes register_kretprobes(struct kretprobe **rps, int num) +{ + return __register_kretprobes(rps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&rps[i]->kp) < 0) + rps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) { + if (rps[i]->kp.addr) { + __unregister_kprobe_bottom(&rps[i]->kp); + cleanup_rp_inst(rps[i]); + } + } +} + #else /* CONFIG_KRETPROBES */ int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { - return 0; + return -ENOSYS; } -#endif /* CONFIG_KRETPROBES */ - void __kprobes unregister_kretprobe(struct kretprobe *rp) { - unsigned long flags; - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; +} - unregister_kprobe(&rp->kp); +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +{ +} - /* No race here */ - spin_lock_irqsave(&kretprobe_lock, flags); - hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { - ri->rp = NULL; - hlist_del(&ri->uflist); - } - spin_unlock_irqrestore(&kretprobe_lock, flags); - free_rp_inst(rp); +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; } +#endif /* CONFIG_KRETPROBES */ + static int __init init_kprobes(void) { int i, err = 0; + unsigned long offset = 0, size = 0; + char *modname, namebuf[128]; + const char *symbol_name; + void *addr; + struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -819,6 +1008,28 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + /* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + kprobe_lookup_name(kb->name, addr); + if (!addr) + continue; + + kb->start_addr = (unsigned long)addr; + symbol_name = kallsyms_lookup(kb->start_addr, + &size, &offset, &modname, namebuf); + if (!symbol_name) + kb->range = 0; + else + kb->range = size; + } + if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -1066,8 +1277,12 @@ module_init(init_kprobes); EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); +EXPORT_SYMBOL_GPL(register_kprobes); +EXPORT_SYMBOL_GPL(unregister_kprobes); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); +EXPORT_SYMBOL_GPL(register_jprobes); +EXPORT_SYMBOL_GPL(unregister_jprobes); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); #endif @@ -1075,4 +1290,6 @@ EXPORT_SYMBOL_GPL(jprobe_return); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); +EXPORT_SYMBOL_GPL(register_kretprobes); +EXPORT_SYMBOL_GPL(unregister_kretprobes); #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index 92cf6930ab5..ac72eea4833 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -144,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), spin_lock(&kthread_create_lock); list_add_tail(&create.list, &kthread_create_list); - wake_up_process(kthreadd_task); spin_unlock(&kthread_create_lock); + wake_up_process(kthreadd_task); wait_for_completion(&create.done); if (!IS_ERR(create.result)) { diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 7c74dab0d21..5e7b45c5692 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -233,14 +233,7 @@ static struct file_operations lstats_fops = { static int __init init_lstats_procfs(void) { - struct proc_dir_entry *pe; - - pe = create_proc_entry("latency_stats", 0644, NULL); - if (!pe) - return -ENOMEM; - - pe->proc_fops = &lstats_fops; - + proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } __initcall(init_lstats_procfs); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 8a135bd163c..dc5d29648d8 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { - struct proc_dir_entry *entry; - - entry = create_proc_entry("lockdep", S_IRUSR, NULL); - if (entry) - entry->proc_fops = &proc_lockdep_operations; - - entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL); - if (entry) - entry->proc_fops = &proc_lockdep_stats_operations; + proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create("lockdep_stats", S_IRUSR, NULL, + &proc_lockdep_stats_operations); #ifdef CONFIG_LOCK_STAT - entry = create_proc_entry("lock_stat", S_IRUSR, NULL); - if (entry) - entry->proc_fops = &proc_lock_stat_operations; + proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); #endif return 0; diff --git a/kernel/marker.c b/kernel/marker.c index 005b9595459..139260e5460 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -23,6 +23,7 @@ #include <linux/rcupdate.h> #include <linux/marker.h> #include <linux/err.h> +#include <linux/slab.h> extern struct marker __start___markers[]; extern struct marker __stop___markers[]; diff --git a/kernel/notifier.c b/kernel/notifier.c index 643360d1bb1..823be11584e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl, return 0; } +static int notifier_chain_cond_register(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if ((*nl) == n) + return 0; + if (n->priority > (*nl)->priority) + break; + nl = &((*nl)->next); + } + n->next = *nl; + rcu_assign_pointer(*nl, n); + return 0; +} + static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { @@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** + * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain, only if not already + * present in the chain. + * Must be called in process context. + * + * Currently always returns zero. + */ +int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + down_write(&nh->rwsem); + ret = notifier_chain_cond_register(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); + +/** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index aead4d69f62..48d7ed6fc3a 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -7,6 +7,8 @@ #include <linux/module.h> #include <linux/cgroup.h> #include <linux/fs.h> +#include <linux/slab.h> +#include <linux/nsproxy.h> struct ns_cgroup { struct cgroup_subsys_state css; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5d332cf8c6..adc785146a1 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } + /* + * CLONE_NEWIPC must detach from the undolist: after switching + * to a new ipc namespace, the semaphore arrays from the old + * namespace are unreachable. In clone parlance, CLONE_SYSVSEM + * means share undolist with parent, so we must forbid using + * it along with CLONE_NEWIPC. + */ + if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { + err = -EINVAL; + goto out; + } + new_ns = create_new_namespaces(flags, tsk, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); diff --git a/kernel/panic.c b/kernel/panic.c index 24af9f8bac9..425567f45b9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic); * 'M' - System experienced a machine check exception. * 'B' - System has hit bad_page. * 'U' - Userspace-defined naughtiness. + * 'A' - ACPI table overridden. + * 'W' - Taint on warning. * * The string is overwritten by the next call to print_taint(). */ @@ -161,7 +163,7 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', @@ -170,7 +172,8 @@ const char *print_tainted(void) tainted & TAINT_BAD_PAGE ? 'B' : ' ', tainted & TAINT_USER ? 'U' : ' ', tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); + tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', + tainted & TAINT_WARN ? 'W' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); @@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line) print_modules(); dump_stack(); print_oops_end_marker(); + add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_on_slowpath); #endif diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d792b66d85..5ca37fa50be 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level) atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = 0; + ns->pidmap[i].page = NULL; atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); } diff --git a/kernel/power/console.c b/kernel/power/console.c index 89bcf4973ee..b8628be2a46 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -7,17 +7,39 @@ #include <linux/vt_kern.h> #include <linux/kbd_kern.h> #include <linux/console.h> +#include <linux/module.h> #include "power.h" #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static int disable_vt_switch; + +/* + * Normally during a suspend, we allocate a new console and switch to it. + * When we resume, we switch back to the original console. This switch + * can be slow, so on systems where the framebuffer can handle restoration + * of video registers anyways, there's little point in doing the console + * switch. This function allows you to disable it by passing it '0'. + */ +void pm_set_vt_switch(int do_switch) +{ + acquire_console_sem(); + disable_vt_switch = !do_switch; + release_console_sem(); +} +EXPORT_SYMBOL(pm_set_vt_switch); int pm_prepare_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return 0; + } + orig_fgconsole = fg_console; if (vc_allocate(SUSPEND_CONSOLE)) { @@ -50,9 +72,12 @@ int pm_prepare_console(void) void pm_restore_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return; + } set_console(orig_fgconsole); release_console_sem(); kmsg_redirect = orig_kmsg; - return; } #endif diff --git a/kernel/printk.c b/kernel/printk.c index bdd4ea8c3f2..d3f9c0f788b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1287,31 +1287,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned toks = 10 * 5 * HZ; - static unsigned long last_msg; - static int missed; - unsigned long flags; - unsigned long now = jiffies; - - spin_lock_irqsave(&ratelimit_lock, flags); - toks += now - last_msg; - last_msg = now; - if (toks > (ratelimit_burst * ratelimit_jiffies)) - toks = ratelimit_burst * ratelimit_jiffies; - if (toks >= ratelimit_jiffies) { - int lost = missed; - - missed = 0; - toks -= ratelimit_jiffies; - spin_unlock_irqrestore(&ratelimit_lock, flags); - if (lost) - printk(KERN_WARNING "printk: %d messages suppressed.\n", lost); - return 1; - } - missed++; - spin_unlock_irqrestore(&ratelimit_lock, flags); - return 0; + return __ratelimit(ratelimit_jiffies, ratelimit_burst); } EXPORT_SYMBOL(__printk_ratelimit); diff --git a/kernel/profile.c b/kernel/profile.c index 606d7387265..ae7ead82cbc 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -587,10 +587,10 @@ static int __init create_proc_profile(void) return 0; if (create_hash_tables()) return -1; - entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); + entry = proc_create("profile", S_IWUSR | S_IRUGO, + NULL, &proc_profile_operations); if (!entry) return 0; - entry->proc_fops = &proc_profile_operations; entry->size = (1+prof_len) * sizeof(atomic_t); hotcpu_notifier(profile_cpu_callback, 0); return 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 67e392ed549..dac4b4e5729 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) return (copied == sizeof(data)) ? 0 : -EIO; } -#ifdef CONFIG_COMPAT +#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE #include <linux/compat.h> int compat_ptrace_request(struct task_struct *child, compat_long_t request, @@ -667,7 +667,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data) { @@ -710,6 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, unlock_kernel(); return ret; } -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ - -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 47894f919d4..33acc424667 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -45,6 +45,7 @@ #include <linux/byteorder/swabb.h> #include <linux/stat.h> #include <linux/srcu.h> +#include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " diff --git a/kernel/relay.c b/kernel/relay.c index d6204a48581..7de644cdec4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = { .close = relay_file_mmap_close, }; +/* + * allocate an array of pointers of struct page + */ +static struct page **relay_alloc_page_array(unsigned int n_pages) +{ + struct page **array; + size_t pa_size = n_pages * sizeof(struct page *); + + if (pa_size > PAGE_SIZE) { + array = vmalloc(pa_size); + if (array) + memset(array, 0, pa_size); + } else { + array = kzalloc(pa_size, GFP_KERNEL); + } + return array; +} + +/* + * free an array of pointers of struct page + */ +static void relay_free_page_array(struct page **array) +{ + if (is_vmalloc_addr(array)) + vfree(array); + else + kfree(array); +} + /** * relay_mmap_buf: - mmap channel buffer to process address space * @buf: relay channel buffer @@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) *size = PAGE_ALIGN(*size); n_pages = *size >> PAGE_SHIFT; - buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); + buf->page_array = relay_alloc_page_array(n_pages); if (!buf->page_array) return NULL; @@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) depopulate: for (j = 0; j < i; j++) __free_page(buf->page_array[j]); - kfree(buf->page_array); + relay_free_page_array(buf->page_array); return NULL; } @@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) vunmap(buf->start); for (i = 0; i < buf->page_count; i++) __free_page(buf->page_array[i]); - kfree(buf->page_array); + relay_free_page_array(buf->page_array); } chan->buf[buf->cpu] = NULL; kfree(buf->padding); @@ -1162,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len) { + while (len && !spliced) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index efbfc0fc232..d3c61b4ebef 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -10,6 +10,7 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> +#include <linux/slab.h> #include <linux/res_counter.h> #include <linux/uaccess.h> @@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) } counter->usage += val; + if (counter->usage > counter->max_usage) + counter->max_usage = counter->usage; return 0; } @@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member) switch (member) { case RES_USAGE: return &counter->usage; + case RES_MAX_USAGE: + return &counter->max_usage; case RES_LIMIT: return &counter->limit; case RES_FAILCNT: @@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + return *res_counter_member(counter, member); +} + ssize_t res_counter_write(struct res_counter *counter, int member, const char __user *userbuf, size_t nbytes, loff_t *pos, int (*write_strategy)(char *st_buf, unsigned long long *val)) diff --git a/kernel/resource.c b/kernel/resource.c index cee12cc47ca..74af2d7cb5a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = { static int __init ioresources_init(void) { - struct proc_dir_entry *entry; - - entry = create_proc_entry("ioports", 0, NULL); - if (entry) - entry->proc_fops = &proc_ioports_operations; - entry = create_proc_entry("iomem", 0, NULL); - if (entry) - entry->proc_fops = &proc_iomem_operations; + proc_create("ioports", 0, NULL, &proc_ioports_operations); + proc_create("iomem", 0, NULL, &proc_iomem_operations); return 0; } __initcall(ioresources_init); diff --git a/kernel/sched.c b/kernel/sched.c index 740fb409e5b..e2f7f5acc80 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9057,13 +9057,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { return sched_group_set_shares(cgroup_tg(cgrp), shareval); } -static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); @@ -9073,48 +9073,14 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) #ifdef CONFIG_RT_GROUP_SCHED static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) + s64 val) { - char buffer[64]; - int retval = 0; - s64 val; - char *end; - - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - - /* strip newline if necessary */ - if (nbytes && (buffer[nbytes-1] == '\n')) - buffer[nbytes-1] = 0; - val = simple_strtoll(buffer, &end, 0); - if (*end) - return -EINVAL; - - /* Pass to subsystem */ - retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); - if (!retval) - retval = nbytes; - return retval; + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); } -static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) { - char tmp[64]; - long val = sched_group_rt_runtime(cgroup_tg(cgrp)); - int len = sprintf(tmp, "%ld\n", val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); + return sched_group_rt_runtime(cgroup_tg(cgrp)); } static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, @@ -9133,20 +9099,20 @@ static struct cftype cpu_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", - .read_uint = cpu_shares_read_uint, - .write_uint = cpu_shares_write_uint, + .read_u64 = cpu_shares_read_u64, + .write_u64 = cpu_shares_write_u64, }, #endif #ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", - .read = cpu_rt_runtime_read, - .write = cpu_rt_runtime_write, + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, }, { .name = "rt_period_us", - .read_uint = cpu_rt_period_read_uint, - .write_uint = cpu_rt_period_write_uint, + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, }, #endif }; @@ -9277,8 +9243,8 @@ out: static struct cftype files[] = { { .name = "usage", - .read_uint = cpuusage_read, - .write_uint = cpuusage_write, + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, }, }; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index f3f4af4b8b0..8a9498e7c83 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -277,12 +277,9 @@ static int __init init_sched_debug_procfs(void) { struct proc_dir_entry *pe; - pe = create_proc_entry("sched_debug", 0644, NULL); + pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops); if (!pe) return -ENOMEM; - - pe->proc_fops = &sched_debug_fops; - return 0; } diff --git a/kernel/sys.c b/kernel/sys.c index 6a0cc71ee88..e423d0d9e6f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1545,6 +1545,19 @@ out: * */ +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, + cputime_t *utimep, cputime_t *stimep) +{ + *utimep = cputime_add(*utimep, t->utime); + *stimep = cputime_add(*stimep, t->stime); + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); +} + static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; @@ -1554,6 +1567,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; + if (who == RUSAGE_THREAD) { + accumulate_thread_rusage(p, r, &utime, &stime); + goto out; + } + rcu_read_lock(); if (!lock_task_sighand(p, &flags)) { rcu_read_unlock(); @@ -1586,14 +1604,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += p->signal->oublock; t = p; do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - r->ru_nvcsw += t->nvcsw; - r->ru_nivcsw += t->nivcsw; - r->ru_minflt += t->min_flt; - r->ru_majflt += t->maj_flt; - r->ru_inblock += task_io_get_inblock(t); - r->ru_oublock += task_io_get_oublock(t); + accumulate_thread_rusage(t, r, &utime, &stime); t = next_thread(t); } while (t != p); break; @@ -1605,6 +1616,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unlock_task_sighand(p, &flags); rcu_read_unlock(); +out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -1618,7 +1630,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) asmlinkage long sys_getrusage(int who, struct rusage __user *ru) { - if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) return -EINVAL; return getrusage(current, who, ru); } @@ -1632,10 +1645,9 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long error; + long uninitialized_var(error); - error = security_task_prctl(option, arg2, arg3, arg4, arg5); - if (error) + if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; switch (option) { @@ -1688,17 +1700,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = -EINVAL; break; - case PR_GET_KEEPCAPS: - if (current->keep_capabilities) - error = 1; - break; - case PR_SET_KEEPCAPS: - if (arg2 != 0 && arg2 != 1) { - error = -EINVAL; - break; - } - current->keep_capabilities = arg2; - break; case PR_SET_NAME: { struct task_struct *me = current; unsigned char ncomm[sizeof(me->comm)]; @@ -1732,17 +1733,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2); break; - - case PR_CAPBSET_READ: - if (!cap_valid(arg2)) - return -EINVAL; - return !!cap_raised(current->cap_bset, arg2); - case PR_CAPBSET_DROP: -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES - return cap_prctl_drop(arg2); -#else - return -EINVAL; -#endif case PR_GET_TSC: error = GET_TSC_CTL(arg2); break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fd3364827cc..d7ffdc59816 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -38,6 +38,7 @@ #include <linux/writeback.h> #include <linux/hugetlb.h> #include <linux/initrd.h> +#include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> @@ -144,12 +145,6 @@ extern int no_unaligned_warning; extern int max_lock_depth; #endif -#ifdef CONFIG_SYSCTL_SYSCALL -static int parse_table(int __user *, int, void __user *, size_t __user *, - void __user *, size_t, struct ctl_table *); -#endif - - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -809,6 +804,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, +#ifdef CONFIG_KEYS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "keys", + .mode = 0555, + .child = key_sysctls, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1430,6 +1433,76 @@ void register_sysctl_root(struct ctl_table_root *root) } #ifdef CONFIG_SYSCTL_SYSCALL +/* Perform the actual read/write of a sysctl table entry. */ +static int do_sysctl_strategy(struct ctl_table_root *root, + struct ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + int op = 0, rc; + + if (oldval) + op |= 004; + if (newval) + op |= 002; + if (sysctl_perm(root, table, op)) + return -EPERM; + + if (table->strategy) { + rc = table->strategy(table, name, nlen, oldval, oldlenp, + newval, newlen); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + } + + /* If there is no strategy routine, or if the strategy returns + * zero, proceed with automatic r/w */ + if (table->data && table->maxlen) { + rc = sysctl_data(table, name, nlen, oldval, oldlenp, + newval, newlen); + if (rc < 0) + return rc; + } + return 0; +} + +static int parse_table(int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + struct ctl_table_root *root, + struct ctl_table *table) +{ + int n; +repeat: + if (!nlen) + return -ENOTDIR; + if (get_user(n, name)) + return -EFAULT; + for ( ; table->ctl_name || table->procname; table++) { + if (!table->ctl_name) + continue; + if (n == table->ctl_name) { + int error; + if (table->child) { + if (sysctl_perm(root, table, 001)) + return -EPERM; + name++; + nlen--; + table = table->child; + goto repeat; + } + error = do_sysctl_strategy(root, table, name, nlen, + oldval, oldlenp, + newval, newlen); + return error; + } + } + return -ENOTDIR; +} + int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1447,7 +1520,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) { error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, head->ctl_table); + newval, newlen, + head->root, head->ctl_table); if (error != -ENOTDIR) { sysctl_head_finish(head); break; @@ -1493,84 +1567,22 @@ static int test_perm(int mode, int op) return -EACCES; } -int sysctl_perm(struct ctl_table *table, int op) +int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) { int error; + int mode; + error = security_sysctl(table, op); if (error) return error; - return test_perm(table->mode, op); -} - -#ifdef CONFIG_SYSCTL_SYSCALL -static int parse_table(int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - struct ctl_table *table) -{ - int n; -repeat: - if (!nlen) - return -ENOTDIR; - if (get_user(n, name)) - return -EFAULT; - for ( ; table->ctl_name || table->procname; table++) { - if (!table->ctl_name) - continue; - if (n == table->ctl_name) { - int error; - if (table->child) { - if (sysctl_perm(table, 001)) - return -EPERM; - name++; - nlen--; - table = table->child; - goto repeat; - } - error = do_sysctl_strategy(table, name, nlen, - oldval, oldlenp, - newval, newlen); - return error; - } - } - return -ENOTDIR; -} -/* Perform the actual read/write of a sysctl table entry. */ -int do_sysctl_strategy (struct ctl_table *table, - int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int op = 0, rc; - - if (oldval) - op |= 004; - if (newval) - op |= 002; - if (sysctl_perm(table, op)) - return -EPERM; + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; - if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); - if (rc < 0) - return rc; - if (rc > 0) - return 0; - } - - /* If there is no strategy routine, or if the strategy returns - * zero, proceed with automatic r/w */ - if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); - if (rc < 0) - return rc; - } - return 0; + return test_perm(mode, op); } -#endif /* CONFIG_SYSCTL_SYSCALL */ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) { @@ -1583,9 +1595,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) static __init int sysctl_init(void) { - int err; sysctl_set_parent(NULL, root_table); - err = sysctl_check_table(current->nsproxy, root_table); +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK + { + int err; + err = sysctl_check_table(current->nsproxy, root_table); + } +#endif return 0; } @@ -1712,10 +1728,12 @@ struct ctl_table_header *__register_sysctl_paths( header->unregistering = NULL; header->root = root; sysctl_set_parent(NULL, header->ctl_table); +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK if (sysctl_check_table(namespaces, header->ctl_table)) { kfree(header); return NULL; } +#endif spin_lock(&sysctl_lock); header_list = lookup_header_list(root, namespaces); list_add_tail(&header->ctl_entry, header_list); diff --git a/kernel/time.c b/kernel/time.c index 35d373a9878..86729042e4c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -35,6 +35,7 @@ #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/unistd.h> diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 67fe8fc21fb..a40e20fd000 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = create_proc_entry("timer_list", 0644, NULL); + pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); if (!pe) return -ENOMEM; - - pe->proc_fops = &timer_list_fops; - return 0; } __initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 417da8c5bc7..c994530d166 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void) { struct proc_dir_entry *pe; - pe = create_proc_entry("timer_stats", 0644, NULL); + pe = proc_create("timer_stats", 0644, NULL, &tstats_fops); if (!pe) return -ENOMEM; - - pe->proc_fops = &tstats_fops; - return 0; } __initcall(init_tstats_procfs); diff --git a/kernel/user.c b/kernel/user.c index debce602bfd..aefbbfa3159 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -53,10 +53,6 @@ struct user_struct root_user = { .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, -#ifdef CONFIG_KEYS - .uid_keyring = &root_user_keyring, - .session_keyring = &root_session_keyring, -#endif #ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif @@ -420,12 +416,12 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) new->mq_bytes = 0; #endif new->locked_shm = 0; - - if (alloc_uid_keyring(new, current) < 0) - goto out_free_user; +#ifdef CONFIG_KEYS + new->uid_keyring = new->session_keyring = NULL; +#endif if (sched_create_user(new) < 0) - goto out_put_keys; + goto out_free_user; if (uids_user_create(new)) goto out_destoy_sched; @@ -459,9 +455,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) out_destoy_sched: sched_destroy_user(new); -out_put_keys: - key_put(new->uid_keyring); - key_put(new->session_keyring); out_free_user: kmem_cache_free(uid_cachep, new); out_unlock: diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4c9006275df..a9ab0596de4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/version.h> #include <linux/nsproxy.h> +#include <linux/slab.h> #include <linux/user_namespace.h> /* @@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref) release_uids(ns); kfree(ns); } +EXPORT_SYMBOL(free_user_ns); diff --git a/kernel/utsname.c b/kernel/utsname.c index 816d7b24fa0..64d398f1244 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include <linux/utsname.h> #include <linux/version.h> #include <linux/err.h> +#include <linux/slab.h> /* * Clone a new ns copying an original utsname, setting refcount to 1 diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 00ff4d08e37..7db251a959c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * * Returns 0 if @work was already on a queue, non-zero otherwise. * - * We queue the work to the CPU it was submitted, but there is no - * guarantee that it will be processed by that CPU. + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. */ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { @@ -772,7 +772,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* * Our caller is either destroy_workqueue() or CPU_DEAD, @@ -808,19 +808,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) void destroy_workqueue(struct workqueue_struct *wq) { const cpumask_t *cpu_map = wq_cpu_map(wq); - struct cpu_workqueue_struct *cwq; int cpu; get_online_cpus(); spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - put_online_cpus(); - for_each_cpu_mask(cpu, *cpu_map) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - cleanup_workqueue_thread(cwq, cpu); - } + for_each_cpu_mask(cpu, *cpu_map) + cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); + put_online_cpus(); free_percpu(wq->cpu_wq); kfree(wq); @@ -838,7 +835,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, action &= ~CPU_TASKS_FROZEN; switch (action) { - case CPU_UP_PREPARE: cpu_set(cpu, cpu_populated_map); } @@ -861,11 +857,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: start_workqueue_thread(cwq, -1); case CPU_DEAD: - cleanup_workqueue_thread(cwq, cpu); + cleanup_workqueue_thread(cwq); break; } } + switch (action) { + case CPU_UP_CANCELED: + case CPU_DEAD: + cpu_clear(cpu, cpu_populated_map); + } + return NOTIFY_OK; } |