diff options
Diffstat (limited to 'kernel')
88 files changed, 7737 insertions, 3713 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 6c584c55a6e..1c9938addb9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,10 +8,10 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o \ - notifier.o ksysfs.o pm_qos_params.o + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + notifier.o ksysfs.o pm_qos_params.o sched_clock.o -obj-$(CONFIG_SYSCTL) += sysctl_check.o +obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o @@ -53,6 +53,7 @@ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/audit.c b/kernel/audit.c index be55cb50363..e092f1c0ce3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -21,7 +21,7 @@ * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * - * Goals: 1) Integrate fully with SELinux. + * Goals: 1) Integrate fully with Security Modules. * 2) Minimal run-time overhead: * a) Minimal when syscall auditing is disabled (audit_enable=0). * b) Small when syscall auditing is enabled and no audit record @@ -55,7 +55,6 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> -#include <linux/selinux.h> #include <linux/inotify.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -127,6 +126,8 @@ static int audit_freelist_count; static LIST_HEAD(audit_freelist); static struct sk_buff_head audit_skb_queue; +/* queue of skbs to send to auditd when/if it comes back */ +static struct sk_buff_head audit_skb_hold_queue; static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); @@ -155,6 +156,11 @@ struct audit_buffer { gfp_t gfp_mask; }; +struct audit_reply { + int pid; + struct sk_buff *skb; +}; + static void audit_set_pid(struct audit_buffer *ab, pid_t pid) { if (ab) { @@ -253,25 +259,26 @@ void audit_log_lost(const char *message) } static int audit_log_config_change(char *function_name, int new, int old, - uid_t loginuid, u32 sid, int allow_changes) + uid_t loginuid, u32 sessionid, u32 sid, + int allow_changes) { struct audit_buffer *ab; int rc = 0; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new, - old, loginuid); + audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, + old, loginuid, sessionid); if (sid) { char *ctx = NULL; u32 len; - rc = selinux_sid_to_string(sid, &ctx, &len); + rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) { audit_log_format(ab, " sid=%u", sid); allow_changes = 0; /* Something weird, deny request */ } else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); } } audit_log_format(ab, " res=%d", allow_changes); @@ -280,7 +287,8 @@ static int audit_log_config_change(char *function_name, int new, int old, } static int audit_do_config_change(char *function_name, int *to_change, - int new, uid_t loginuid, u32 sid) + int new, uid_t loginuid, u32 sessionid, + u32 sid) { int allow_changes, rc = 0, old = *to_change; @@ -291,8 +299,8 @@ static int audit_do_config_change(char *function_name, int *to_change, allow_changes = 1; if (audit_enabled != AUDIT_OFF) { - rc = audit_log_config_change(function_name, new, old, - loginuid, sid, allow_changes); + rc = audit_log_config_change(function_name, new, old, loginuid, + sessionid, sid, allow_changes); if (rc) allow_changes = 0; } @@ -306,26 +314,28 @@ static int audit_do_config_change(char *function_name, int *to_change, return rc; } -static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) +static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, + u32 sid) { return audit_do_config_change("audit_rate_limit", &audit_rate_limit, - limit, loginuid, sid); + limit, loginuid, sessionid, sid); } -static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) +static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, + u32 sid) { return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, - limit, loginuid, sid); + limit, loginuid, sessionid, sid); } -static int audit_set_enabled(int state, uid_t loginuid, u32 sid) +static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) { int rc; if (state < AUDIT_OFF || state > AUDIT_LOCKED) return -EINVAL; rc = audit_do_config_change("audit_enabled", &audit_enabled, state, - loginuid, sid); + loginuid, sessionid, sid); if (!rc) audit_ever_enabled |= !!state; @@ -333,7 +343,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) return rc; } -static int audit_set_failure(int state, uid_t loginuid, u32 sid) +static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) { if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK @@ -341,7 +351,43 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) return -EINVAL; return audit_do_config_change("audit_failure", &audit_failure, state, - loginuid, sid); + loginuid, sessionid, sid); +} + +/* + * Queue skbs to be sent to auditd when/if it comes back. These skbs should + * already have been sent via prink/syslog and so if these messages are dropped + * it is not a huge concern since we already passed the audit_log_lost() + * notification and stuff. This is just nice to get audit messages during + * boot before auditd is running or messages generated while auditd is stopped. + * This only holds messages is audit_default is set, aka booting with audit=1 + * or building your kernel that way. + */ +static void audit_hold_skb(struct sk_buff *skb) +{ + if (audit_default && + skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit) + skb_queue_tail(&audit_skb_hold_queue, skb); + else + kfree_skb(skb); +} + +static void kauditd_send_skb(struct sk_buff *skb) +{ + int err; + /* take a reference in case we can't send it and we want to hold it */ + skb_get(skb); + err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); + if (err < 0) { + BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); + audit_log_lost("auditd dissapeared\n"); + audit_pid = 0; + /* we might get lucky and get this in the next auditd */ + audit_hold_skb(skb); + } else + /* drop the extra reference if sent ok */ + kfree_skb(skb); } static int kauditd_thread(void *dummy) @@ -350,24 +396,41 @@ static int kauditd_thread(void *dummy) set_freezable(); while (!kthread_should_stop()) { + /* + * if auditd just started drain the queue of messages already + * sent to syslog/printk. remember loss here is ok. we already + * called audit_log_lost() if it didn't go out normally. so the + * race between the skb_dequeue and the next check for audit_pid + * doesn't matter. + * + * if you ever find kauditd to be too slow we can get a perf win + * by doing our own locking and keeping better track if there + * are messages in this queue. I don't see the need now, but + * in 5 years when I want to play with this again I'll see this + * note and still have no friggin idea what i'm thinking today. + */ + if (audit_default && audit_pid) { + skb = skb_dequeue(&audit_skb_hold_queue); + if (unlikely(skb)) { + while (skb && audit_pid) { + kauditd_send_skb(skb); + skb = skb_dequeue(&audit_skb_hold_queue); + } + } + } + skb = skb_dequeue(&audit_skb_queue); wake_up(&audit_backlog_wait); if (skb) { - if (audit_pid) { - int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); - if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ - printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd dissapeared\n"); - audit_pid = 0; - } - } else { + if (audit_pid) + kauditd_send_skb(skb); + else { if (printk_ratelimit()) - printk(KERN_NOTICE "%s\n", skb->data + - NLMSG_SPACE(0)); + printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); else audit_log_lost("printk limit exceeded\n"); - kfree_skb(skb); + + audit_hold_skb(skb); } } else { DECLARE_WAITQUEUE(wait, current); @@ -386,13 +449,13 @@ static int kauditd_thread(void *dummy) return 0; } -static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) +static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) { struct task_struct *tsk; int err; read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); err = -ESRCH; if (!tsk) goto out; @@ -405,7 +468,7 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid) if (err) goto out; - tty_audit_push_task(tsk, loginuid); + tty_audit_push_task(tsk, loginuid, sessionid); out: read_unlock(&tasklist_lock); return err; @@ -470,6 +533,19 @@ nlmsg_failure: /* Used by NLMSG_PUT */ return NULL; } +static int audit_send_reply_thread(void *arg) +{ + struct audit_reply *reply = (struct audit_reply *)arg; + + mutex_lock(&audit_cmd_mutex); + mutex_unlock(&audit_cmd_mutex); + + /* Ignore failure. It'll only happen if the sender goes away, + because our timeout is set to infinite. */ + netlink_unicast(audit_sock, reply->skb, reply->pid, 0); + kfree(reply); + return 0; +} /** * audit_send_reply - send an audit reply message via netlink * @pid: process id to send reply to @@ -486,14 +562,27 @@ nlmsg_failure: /* Used by NLMSG_PUT */ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { - struct sk_buff *skb; + struct sk_buff *skb; + struct task_struct *tsk; + struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), + GFP_KERNEL); + + if (!reply) + return; + skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) + goto out; + + reply->pid = pid; + reply->skb = skb; + + tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); + if (!IS_ERR(tsk)) return; - /* Ignore failure. It'll only happen if the sender goes away, - because our timeout is set to infinite. */ - netlink_unicast(audit_sock, skb, pid, 0); - return; + kfree_skb(skb); +out: + kfree(reply); } /* @@ -535,7 +624,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) } static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, - u32 pid, u32 uid, uid_t auid, u32 sid) + u32 pid, u32 uid, uid_t auid, u32 ses, + u32 sid) { int rc = 0; char *ctx = NULL; @@ -547,15 +637,16 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u", - pid, uid, auid); + audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + pid, uid, auid, ses); if (sid) { - rc = selinux_sid_to_string(sid, &ctx, &len); + rc = security_secid_to_secctx(sid, &ctx, &len); if (rc) audit_log_format(*ab, " ssid=%u", sid); - else + else { audit_log_format(*ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } return rc; @@ -570,6 +661,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ + u32 sessionid; struct audit_sig_info *sig_data; char *ctx = NULL; u32 len; @@ -591,6 +683,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; loginuid = NETLINK_CB(skb).loginuid; + sessionid = NETLINK_CB(skb).sessionid; sid = NETLINK_CB(skb).sid; seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); @@ -613,12 +706,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(status_get->enabled, - loginuid, sid); + loginuid, sessionid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(status_get->failure, - loginuid, sid); + loginuid, sessionid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_PID) { @@ -627,17 +720,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, loginuid, - sid, 1); + sessionid, sid, 1); audit_pid = new_pid; audit_nlk_pid = NETLINK_CB(skb).pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) err = audit_set_rate_limit(status_get->rate_limit, - loginuid, sid); + loginuid, sessionid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit, - loginuid, sid); + loginuid, sessionid, sid); break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: @@ -645,16 +738,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!audit_enabled && msg_type != AUDIT_USER_AVC) return 0; - err = audit_filter_user(&NETLINK_CB(skb), msg_type); + err = audit_filter_user(&NETLINK_CB(skb)); if (err == 1) { err = 0; if (msg_type == AUDIT_USER_TTY) { - err = audit_prepare_user_tty(pid, loginuid); + err = audit_prepare_user_tty(pid, loginuid, + sessionid); if (err) break; } audit_log_common_recv_msg(&ab, msg_type, pid, uid, - loginuid, sid); + loginuid, sessionid, sid); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " msg='%.1024s'", @@ -664,8 +758,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_format(ab, " msg="); size = nlmsg_len(nlh); - audit_log_n_untrustedstring(ab, size, - data); + audit_log_n_untrustedstring(ab, data, size); } audit_set_pid(ab, pid); audit_log_end(ab); @@ -677,7 +770,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -686,9 +779,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid, sid); + loginuid, sessionid, sid); break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -696,7 +789,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); @@ -705,15 +798,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid, sid); + loginuid, sessionid, sid); break; case AUDIT_TRIM: audit_trim_trees(); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); @@ -721,21 +814,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_MAKE_EQUIV: { void *bufp = data; u32 sizes[2]; - size_t len = nlmsg_len(nlh); + size_t msglen = nlmsg_len(nlh); char *old, *new; err = -EINVAL; - if (len < 2 * sizeof(u32)) + if (msglen < 2 * sizeof(u32)) break; memcpy(sizes, bufp, 2 * sizeof(u32)); bufp += 2 * sizeof(u32); - len -= 2 * sizeof(u32); - old = audit_unpack_string(&bufp, &len, sizes[0]); + msglen -= 2 * sizeof(u32); + old = audit_unpack_string(&bufp, &msglen, sizes[0]); if (IS_ERR(old)) { err = PTR_ERR(old); break; } - new = audit_unpack_string(&bufp, &len, sizes[1]); + new = audit_unpack_string(&bufp, &msglen, sizes[1]); if (IS_ERR(new)) { err = PTR_ERR(new); kfree(old); @@ -745,7 +838,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = audit_tag_tree(old, new); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, - uid, loginuid, sid); + uid, loginuid, sessionid, sid); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); @@ -758,18 +851,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = selinux_sid_to_string(audit_sig_sid, &ctx, &len); + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); if (err) return err; sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - kfree(ctx); + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; memcpy(sig_data->ctx, ctx, len); - kfree(ctx); + security_release_secctx(ctx, len); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); @@ -779,7 +872,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct task_struct *tsk; read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (!tsk) err = -ESRCH; else { @@ -802,7 +895,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (s->enabled != 0 && s->enabled != 1) return -EINVAL; read_lock(&tasklist_lock); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (!tsk) err = -ESRCH; else { @@ -877,14 +970,11 @@ static int __init audit_init(void) audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; skb_queue_head_init(&audit_skb_queue); + skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = 1; audit_enabled = audit_default; audit_ever_enabled |= !!audit_default; - /* Register the callback with selinux. This callback will be invoked - * when a new policy is loaded. */ - selinux_audit_set_callback(&selinux_audit_rule_update); - audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); #ifdef CONFIG_AUDITSYSCALL @@ -1203,7 +1293,7 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) * This function will take the passed buf and convert it into a string of * ascii hex digits. The new string is placed onto the skb. */ -void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, +void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { int i, avail, new_len; @@ -1239,8 +1329,8 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, * Format a string of no more than slen characters into the audit buffer, * enclosed in quote marks. */ -static void audit_log_n_string(struct audit_buffer *ab, size_t slen, - const char *string) +void audit_log_n_string(struct audit_buffer *ab, const char *string, + size_t slen) { int avail, new_len; unsigned char *ptr; @@ -1269,8 +1359,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, /** * audit_string_contains_control - does a string need to be logged in hex - * @string - string to be checked - * @len - max length of the string to check + * @string: string to be checked + * @len: max length of the string to check */ int audit_string_contains_control(const char *string, size_t len) { @@ -1285,7 +1375,7 @@ int audit_string_contains_control(const char *string, size_t len) /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer - * @len: lenth of string (not including trailing null) + * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string @@ -1296,13 +1386,13 @@ int audit_string_contains_control(const char *string, size_t len) * The caller specifies the number of characters in the string to log, which may * or may not be the entire string. */ -void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, - const char *string) +void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, + size_t len) { if (audit_string_contains_control(string, len)) - audit_log_hex(ab, string, len); + audit_log_n_hex(ab, string, len); else - audit_log_n_string(ab, len, string); + audit_log_n_string(ab, string, len); } /** @@ -1315,7 +1405,7 @@ void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, */ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { - audit_log_n_untrustedstring(ab, strlen(string), string); + audit_log_n_untrustedstring(ab, string, strlen(string)); } /* This is a helper-function to print the escaped d_path */ @@ -1359,19 +1449,23 @@ void audit_log_end(struct audit_buffer *ab) audit_log_lost("rate limit exceeded"); } else { struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); + nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); + if (audit_pid) { - nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); skb_queue_tail(&audit_skb_queue, ab->skb); - ab->skb = NULL; wake_up_interruptible(&kauditd_wait); - } else if (nlh->nlmsg_type != AUDIT_EOE) { - if (printk_ratelimit()) { - printk(KERN_NOTICE "type=%d %s\n", - nlh->nlmsg_type, - ab->skb->data + NLMSG_SPACE(0)); - } else - audit_log_lost("printk limit exceeded\n"); + } else { + if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) { + printk(KERN_NOTICE "type=%d %s\n", + nlh->nlmsg_type, + ab->skb->data + NLMSG_SPACE(0)); + } else + audit_log_lost("printk limit exceeded\n"); + } + audit_hold_skb(ab->skb); } + ab->skb = NULL; } audit_buffer_free(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 2554bd524fd..9d6717412fe 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -65,40 +65,20 @@ struct audit_watch { struct list_head rules; /* associated rules */ }; -struct audit_field { - u32 type; - u32 val; - u32 op; - char *se_str; - struct selinux_audit_rule *se_rule; -}; - struct audit_tree; struct audit_chunk; -struct audit_krule { - int vers_ops; - u32 flags; - u32 listnr; - u32 action; - u32 mask[AUDIT_BITMASK_SIZE]; - u32 buflen; /* for data alloc on list rules */ - u32 field_count; - char *filterkey; /* ties events to rules */ - struct audit_field *fields; - struct audit_field *arch_f; /* quick access to arch field */ - struct audit_field *inode_f; /* quick access to an inode field */ - struct audit_watch *watch; /* associated watch */ - struct audit_tree *tree; /* associated watched tree */ - struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ -}; - struct audit_entry { struct list_head list; struct rcu_head rcu; struct audit_krule rule; }; +#ifdef CONFIG_AUDIT +extern int audit_enabled; +extern int audit_ever_enabled; +#endif + extern int audit_pid; #define AUDIT_INODE_BUCKETS 32 @@ -129,6 +109,9 @@ struct audit_netlink_list { int audit_send_list(void *); struct inotify_watch; +/* Inotify handle */ +extern struct inotify_handle *audit_ih; + extern void audit_free_parent(struct inotify_watch *); extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, const char *, struct inode *); @@ -136,6 +119,7 @@ extern int selinux_audit_rule_update(void); extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); +extern struct list_head audit_filter_list[]; #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); @@ -162,6 +146,10 @@ extern void audit_put_tree(struct audit_tree *); extern char *audit_unpack_string(void **, size_t *, size_t); +extern pid_t audit_sig_pid; +extern uid_t audit_sig_uid; +extern u32 audit_sig_sid; + #ifdef CONFIG_AUDITSYSCALL extern int __audit_signal_info(int sig, struct task_struct *t); static inline int audit_signal_info(int sig, struct task_struct *t) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 9ef5e0aacc3..f7921a2ecf1 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -172,10 +172,9 @@ static void insert_hash(struct audit_chunk *chunk) struct audit_chunk *audit_tree_lookup(const struct inode *inode) { struct list_head *list = chunk_hash(inode); - struct list_head *pos; + struct audit_chunk *p; - list_for_each_rcu(pos, list) { - struct audit_chunk *p = container_of(pos, struct audit_chunk, hash); + list_for_each_entry_rcu(p, list, hash) { if (p->watch.inode == inode) { get_inotify_watch(&p->watch); return p; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 2f2914b7cc3..98c50cc671b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -28,7 +28,7 @@ #include <linux/netlink.h> #include <linux/sched.h> #include <linux/inotify.h> -#include <linux/selinux.h> +#include <linux/security.h> #include "audit.h" /* @@ -38,7 +38,7 @@ * Synchronizes writes and blocking reads of audit's filterlist * data. Rcu is used to traverse the filterlist and access * contents of structs audit_entry, audit_watch and opaque - * selinux rules during filtering. If modified, these structures + * LSM rules during filtering. If modified, these structures * must be copied and replace their counterparts in the filterlist. * An audit_parent struct is not accessed during filtering, so may * be written directly provided audit_filter_mutex is held. @@ -89,14 +89,9 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { DEFINE_MUTEX(audit_filter_mutex); -/* Inotify handle */ -extern struct inotify_handle *audit_ih; - /* Inotify events we care about. */ #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF -extern int audit_enabled; - void audit_free_parent(struct inotify_watch *i_watch) { struct audit_parent *parent; @@ -139,8 +134,8 @@ static inline void audit_free_rule(struct audit_entry *e) if (e->rule.fields) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; - kfree(f->se_str); - selinux_audit_rule_free(f->se_rule); + kfree(f->lsm_str); + security_audit_rule_free(f->lsm_rule); } kfree(e->rule.fields); kfree(e->rule.filterkey); @@ -272,7 +267,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len, return -EINVAL; watch = audit_init_watch(path); - if (unlikely(IS_ERR(watch))) + if (IS_ERR(watch)) return PTR_ERR(watch); audit_get_watch(watch); @@ -422,7 +417,7 @@ exit_err: static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; - struct audit_field *f; + struct audit_field *ino_f; int err = 0; int i; @@ -483,6 +478,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) if (f->val & ~15) goto exit_free; break; + case AUDIT_FILETYPE: + if ((f->val & ~S_IFMT) > S_IFMT) + goto exit_free; + break; case AUDIT_INODE: err = audit_to_inode(&entry->rule, f); if (err) @@ -504,9 +503,9 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) } } - f = entry->rule.inode_f; - if (f) { - switch(f->op) { + ino_f = entry->rule.inode_f; + if (ino_f) { + switch(ino_f->op) { case AUDIT_NOT_EQUAL: entry->rule.inode_f = NULL; case AUDIT_EQUAL: @@ -531,7 +530,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; - struct audit_field *f; + struct audit_field *ino_f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -554,8 +553,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; f->val = data->values[i]; - f->se_str = NULL; - f->se_rule = NULL; + f->lsm_str = NULL; + f->lsm_rule = NULL; switch(f->type) { case AUDIT_PID: case AUDIT_UID: @@ -597,12 +596,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; entry->rule.buflen += f->val; - err = selinux_audit_rule_init(f->type, f->op, str, - &f->se_rule); + err = security_audit_rule_init(f->type, f->op, str, + (void **)&f->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for selinux " + printk(KERN_WARNING "audit rule for LSM " "\'%s\' is invalid\n", str); err = 0; } @@ -610,7 +609,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, kfree(str); goto exit_free; } else - f->se_str = str; + f->lsm_str = str; break; case AUDIT_WATCH: str = audit_unpack_string(&bufp, &remain, f->val); @@ -654,14 +653,18 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (f->val & ~15) goto exit_free; break; + case AUDIT_FILETYPE: + if ((f->val & ~S_IFMT) > S_IFMT) + goto exit_free; + break; default: goto exit_free; } } - f = entry->rule.inode_f; - if (f) { - switch(f->op) { + ino_f = entry->rule.inode_f; + if (ino_f) { + switch(ino_f->op) { case AUDIT_NOT_EQUAL: entry->rule.inode_f = NULL; case AUDIT_EQUAL: @@ -754,7 +757,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: data->buflen += data->values[i] = - audit_pack_string(&bufp, f->se_str); + audit_pack_string(&bufp, f->lsm_str); break; case AUDIT_WATCH: data->buflen += data->values[i] = @@ -806,7 +809,7 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) + if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) return 1; break; case AUDIT_WATCH: @@ -848,7 +851,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) return ERR_PTR(-ENOMEM); new = audit_init_watch(path); - if (unlikely(IS_ERR(new))) { + if (IS_ERR(new)) { kfree(path); goto out; } @@ -862,28 +865,28 @@ out: return new; } -/* Duplicate selinux field information. The se_rule is opaque, so must be +/* Duplicate LSM field information. The lsm_rule is opaque, so must be * re-initialized. */ -static inline int audit_dupe_selinux_field(struct audit_field *df, +static inline int audit_dupe_lsm_field(struct audit_field *df, struct audit_field *sf) { int ret = 0; - char *se_str; + char *lsm_str; - /* our own copy of se_str */ - se_str = kstrdup(sf->se_str, GFP_KERNEL); - if (unlikely(!se_str)) + /* our own copy of lsm_str */ + lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL); + if (unlikely(!lsm_str)) return -ENOMEM; - df->se_str = se_str; + df->lsm_str = lsm_str; - /* our own (refreshed) copy of se_rule */ - ret = selinux_audit_rule_init(df->type, df->op, df->se_str, - &df->se_rule); + /* our own (refreshed) copy of lsm_rule */ + ret = security_audit_rule_init(df->type, df->op, df->lsm_str, + (void **)&df->lsm_rule); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for selinux \'%s\' is " - "invalid\n", df->se_str); + printk(KERN_WARNING "audit rule for LSM \'%s\' is " + "invalid\n", df->lsm_str); ret = 0; } @@ -891,7 +894,7 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, } /* Duplicate an audit rule. This will be a deep copy with the exception - * of the watch - that pointer is carried over. The selinux specific fields + * of the watch - that pointer is carried over. The LSM specific fields * will be updated in the copy. The point is to be able to replace the old * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from @@ -930,7 +933,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->tree = old->tree; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); - /* deep copy this information, updating the se_rule fields, because + /* deep copy this information, updating the lsm_rule fields, because * the originals will all be freed when the old rule is freed. */ for (i = 0; i < fcount; i++) { switch (new->fields[i].type) { @@ -944,7 +947,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old, case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: - err = audit_dupe_selinux_field(&new->fields[i], + err = audit_dupe_lsm_field(&new->fields[i], &old->fields[i]); break; case AUDIT_FILTERKEY: @@ -989,7 +992,7 @@ static void audit_update_watch(struct audit_parent *parent, audit_set_auditable(current->audit_context); nwatch = audit_dupe_watch(owatch); - if (unlikely(IS_ERR(nwatch))) { + if (IS_ERR(nwatch)) { mutex_unlock(&audit_filter_mutex); audit_panic("error updating watch, skipping"); return; @@ -1004,7 +1007,7 @@ static void audit_update_watch(struct audit_parent *parent, list_del_rcu(&oentry->list); nentry = audit_dupe_rule(&oentry->rule, nwatch); - if (unlikely(IS_ERR(nentry))) + if (IS_ERR(nentry)) audit_panic("error updating watch, removing"); else { int h = audit_hash_ino((u32)ino); @@ -1500,8 +1503,9 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) } /* Log rule additions and removals */ -static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, - struct audit_krule *rule, int res) +static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, + char *action, struct audit_krule *rule, + int res) { struct audit_buffer *ab; @@ -1511,15 +1515,16 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; - audit_log_format(ab, "auid=%u", loginuid); + audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); if (sid) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string(sid, &ctx, &len)) + if (security_secid_to_secctx(sid, &ctx, &len)) audit_log_format(ab, " ssid=%u", sid); - else + else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " op=%s rule key=", action); if (rule->filterkey) @@ -1539,10 +1544,11 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action, * @data: payload data * @datasz: size of payload data * @loginuid: loginuid of sender + * @sessionid: sessionid for netlink audit message * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid, u32 sid) + size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) { struct task_struct *tsk; struct audit_netlink_list *dest; @@ -1589,7 +1595,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err); + audit_log_rule_change(loginuid, sessionid, sid, "add", + &entry->rule, !err); if (err) audit_free_rule(entry); @@ -1605,8 +1612,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log_rule_change(loginuid, sid, "remove", &entry->rule, - !err); + audit_log_rule_change(loginuid, sessionid, sid, "remove", + &entry->rule, !err); audit_free_rule(entry); break; @@ -1714,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, return 1; } -int audit_filter_user(struct netlink_skb_parms *cb, int type) +int audit_filter_user(struct netlink_skb_parms *cb) { enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; @@ -1761,38 +1768,12 @@ unlock_and_return: return result; } -/* Check to see if the rule contains any selinux fields. Returns 1 if there - are selinux fields specified in the rule, 0 otherwise. */ -static inline int audit_rule_has_selinux(struct audit_krule *rule) -{ - int i; - - for (i = 0; i < rule->field_count; i++) { - struct audit_field *f = &rule->fields[i]; - switch (f->type) { - case AUDIT_SUBJ_USER: - case AUDIT_SUBJ_ROLE: - case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case AUDIT_SUBJ_CLR: - case AUDIT_OBJ_USER: - case AUDIT_OBJ_ROLE: - case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: - return 1; - } - } - - return 0; -} - -/* This function will re-initialize the se_rule field of all applicable rules. - * It will traverse the filter lists serarching for rules that contain selinux +/* This function will re-initialize the lsm_rule field of all applicable rules. + * It will traverse the filter lists serarching for rules that contain LSM * specific filter fields. When such a rule is found, it is copied, the - * selinux field is re-initialized, and the old rule is replaced with the + * LSM field is re-initialized, and the old rule is replaced with the * updated rule. */ -int selinux_audit_rule_update(void) +int audit_update_lsm_rules(void) { struct audit_entry *entry, *n, *nentry; struct audit_watch *watch; @@ -1804,18 +1785,18 @@ int selinux_audit_rule_update(void) for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { - if (!audit_rule_has_selinux(&entry->rule)) + if (!security_audit_rule_known(&entry->rule)) continue; watch = entry->rule.watch; tree = entry->rule.tree; nentry = audit_dupe_rule(&entry->rule, watch); - if (unlikely(IS_ERR(nentry))) { + if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ if (!err) err = PTR_ERR(nentry); - audit_panic("error updating selinux filters"); + audit_panic("error updating LSM filters"); if (watch) list_del(&entry->rule.rlist); list_del_rcu(&entry->list); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 782262e4107..c10e7aae04d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -61,7 +61,6 @@ #include <linux/security.h> #include <linux/list.h> #include <linux/tty.h> -#include <linux/selinux.h> #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> @@ -69,9 +68,6 @@ #include "audit.h" -extern struct list_head audit_filter_list[]; -extern int audit_ever_enabled; - /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). */ #define AUDIT_NAMES 20 @@ -284,6 +280,19 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } +static int audit_match_filetype(struct audit_context *ctx, int which) +{ + unsigned index = which & ~S_IFMT; + mode_t mode = which & S_IFMT; + if (index >= ctx->name_count) + return 0; + if (ctx->names[index].ino == -1) + return 0; + if ((ctx->names[index].mode ^ mode) & S_IFMT) + return 0; + return 1; +} + /* * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *; * ->first_trees points to its beginning, ->trees - to the current end of data. @@ -528,14 +537,14 @@ static int audit_filter_rules(struct task_struct *tsk, match for now to avoid losing information that may be wanted. An error message will also be logged upon error */ - if (f->se_rule) { + if (f->lsm_rule) { if (need_sid) { - selinux_get_task_sid(tsk, &sid); + security_task_getsecid(tsk, &sid); need_sid = 0; } - result = selinux_audit_rule_match(sid, f->type, + result = security_audit_rule_match(sid, f->type, f->op, - f->se_rule, + f->lsm_rule, ctx); } break; @@ -546,18 +555,18 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_OBJ_LEV_HIGH: /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR also applies here */ - if (f->se_rule) { + if (f->lsm_rule) { /* Find files that match */ if (name) { - result = selinux_audit_rule_match( + result = security_audit_rule_match( name->osid, f->type, f->op, - f->se_rule, ctx); + f->lsm_rule, ctx); } else if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (selinux_audit_rule_match( + if (security_audit_rule_match( ctx->names[j].osid, f->type, f->op, - f->se_rule, ctx)) { + f->lsm_rule, ctx)) { ++result; break; } @@ -570,7 +579,7 @@ static int audit_filter_rules(struct task_struct *tsk, aux = aux->next) { if (aux->type == AUDIT_IPC) { struct audit_aux_data_ipcctl *axi = (void *)aux; - if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) { + if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) { ++result; break; } @@ -593,6 +602,9 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PERM: result = audit_match_perm(ctx, f->val); break; + case AUDIT_FILETYPE: + result = audit_match_filetype(ctx, f->val); + break; } if (!result) @@ -885,11 +897,11 @@ void audit_log_task_context(struct audit_buffer *ab) int error; u32 sid; - selinux_get_task_sid(current, &sid); + security_task_getsecid(current, &sid); if (!sid) return; - error = selinux_sid_to_string(sid, &ctx, &len); + error = security_secid_to_secctx(sid, &ctx, &len); if (error) { if (error != -EINVAL) goto error_path; @@ -897,7 +909,7 @@ void audit_log_task_context(struct audit_buffer *ab) } audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); return; error_path: @@ -941,7 +953,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, u32 sid, char *comm) { struct audit_buffer *ab; - char *s = NULL; + char *ctx = NULL; u32 len; int rc = 0; @@ -951,15 +963,16 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, uid, sessionid); - if (selinux_sid_to_string(sid, &s, &len)) { + if (security_secid_to_secctx(sid, &ctx, &len)) { audit_log_format(ab, " obj=(none)"); rc = 1; - } else - audit_log_format(ab, " obj=%s", s); + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); - kfree(s); return rc; } @@ -1095,7 +1108,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, audit_log_format(*ab, "[%d]", i); audit_log_format(*ab, "="); if (has_cntl) - audit_log_hex(*ab, buf, to_send); + audit_log_n_hex(*ab, buf, to_send); else audit_log_format(*ab, "\"%s\"", buf); audit_log_format(*ab, "\n"); @@ -1271,14 +1284,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (axi->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string( + if (security_secid_to_secctx( axi->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", axi->osid); call_panic = 1; - } else + } else { audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } break; } @@ -1295,7 +1309,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts break; } case AUDIT_SOCKETCALL: { - int i; struct audit_aux_data_socketcall *axs = (void *)aux; audit_log_format(ab, "nargs=%d", axs->nargs); for (i=0; i<axs->nargs; i++) @@ -1306,7 +1319,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_aux_data_sockaddr *axs = (void *)aux; audit_log_format(ab, "saddr="); - audit_log_hex(ab, axs->a, axs->len); + audit_log_n_hex(ab, axs->a, axs->len); break; } case AUDIT_FD_PAIR: { @@ -1320,7 +1333,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts for (aux = context->aux_pids; aux; aux = aux->next) { struct audit_aux_data_pids *axs = (void *)aux; - int i; for (i = 0; i < axs->pid_count; i++) if (audit_log_pid_context(context, axs->target_pid[i], @@ -1370,8 +1382,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts default: /* log the name's directory component */ audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name_len, - n->name); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); } } else audit_log_format(ab, " name=(null)"); @@ -1392,13 +1404,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (n->osid != 0) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string( + if (security_secid_to_secctx( n->osid, &ctx, &len)) { audit_log_format(ab, " osid=%u", n->osid); call_panic = 2; - } else + } else { audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_end(ab); @@ -1594,7 +1607,7 @@ static inline void handle_one(const struct inode *inode) if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference"); + printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); @@ -1654,7 +1667,7 @@ retry: } /* too bad */ printk(KERN_WARNING - "out of memory, audit has lost a tree reference"); + "out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; @@ -1750,13 +1763,13 @@ static int audit_inc_name_count(struct audit_context *context, if (context->name_count >= AUDIT_NAMES) { if (inode) printk(KERN_DEBUG "name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu", + "dev=%02x:%02x, inode=%lu\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); else - printk(KERN_DEBUG "name_count maxed, losing inode data"); + printk(KERN_DEBUG "name_count maxed, losing inode data\n"); return 1; } context->name_count++; @@ -1775,7 +1788,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode name->uid = inode->i_uid; name->gid = inode->i_gid; name->rdev = inode->i_rdev; - selinux_get_inode_sid(inode, &name->osid); + security_inode_getsecid(inode, &name->osid); } /** @@ -2190,8 +2203,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) ax->uid = ipcp->uid; ax->gid = ipcp->gid; ax->mode = ipcp->mode; - selinux_get_ipc_sid(ipcp, &ax->osid); - + security_ipc_getsecid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC; ax->d.next = context->aux; context->aux = (void *)ax; @@ -2343,7 +2355,7 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = t->uid; context->target_sessionid = audit_get_sessionid(t); - selinux_get_task_sid(t, &context->target_sid); + security_task_getsecid(t, &context->target_sid); memcpy(context->target_comm, t->comm, TASK_COMM_LEN); } @@ -2360,9 +2372,6 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_aux_data_pids *axp; struct task_struct *tsk = current; struct audit_context *ctx = tsk->audit_context; - extern pid_t audit_sig_pid; - extern uid_t audit_sig_uid; - extern u32 audit_sig_sid; if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { @@ -2371,7 +2380,7 @@ int __audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = tsk->loginuid; else audit_sig_uid = tsk->uid; - selinux_get_task_sid(tsk, &audit_sig_sid); + security_task_getsecid(tsk, &audit_sig_sid); } if (!audit_signals || audit_dummy_context()) return 0; @@ -2384,7 +2393,7 @@ int __audit_signal_info(int sig, struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t->uid; ctx->target_sessionid = audit_get_sessionid(t); - selinux_get_task_sid(t, &ctx->target_sid); + security_task_getsecid(t, &ctx->target_sid); memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); return 0; } @@ -2405,7 +2414,7 @@ int __audit_signal_info(int sig, struct task_struct *t) axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t->uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); - selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); + security_task_getsecid(t, &axp->target_sid[axp->pid_count]); memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); axp->pid_count++; @@ -2435,16 +2444,17 @@ void audit_core_dumps(long signr) ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", auid, current->uid, current->gid, sessionid); - selinux_get_task_sid(current, &sid); + security_task_getsecid(current, &sid); if (sid) { char *ctx = NULL; u32 len; - if (selinux_sid_to_string(sid, &ctx, &len)) + if (security_secid_to_secctx(sid, &ctx, &len)) audit_log_format(ab, " ssid=%u", sid); - else + else { audit_log_format(ab, " subj=%s", ctx); - kfree(ctx); + security_release_secctx(ctx, len); + } } audit_log_format(ab, " pid=%d comm=", current->pid); audit_log_untrustedstring(ab, current->comm); diff --git a/kernel/bounds.c b/kernel/bounds.c new file mode 100644 index 00000000000..3c530138183 --- /dev/null +++ b/kernel/bounds.c @@ -0,0 +1,19 @@ +/* + * Generate definitions needed by the preprocessor. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#define __GENERATING_BOUNDS_H +/* Include headers that define the enum constants of interest */ +#include <linux/page-flags.h> +#include <linux/mmzone.h> +#include <linux/kbuild.h> + +void foo(void) +{ + /* The enum constants to put into include/linux/bounds.h */ + DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); + DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + /* End of constants */ +} diff --git a/kernel/capability.c b/kernel/capability.c index 39e8193b41e..cfbe4429948 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -53,6 +53,69 @@ static void warn_legacy_capability_use(void) } /* + * Version 2 capabilities worked fine, but the linux/capability.h file + * that accompanied their introduction encouraged their use without + * the necessary user-space source code changes. As such, we have + * created a version 3 with equivalent functionality to version 2, but + * with a header change to protect legacy source code from using + * version 2 when it wanted to use version 1. If your system has code + * that trips the following warning, it is using version 2 specific + * capabilities and may be doing so insecurely. + * + * The remedy is to either upgrade your version of libcap (to 2.10+, + * if the application is linked against it), or recompile your + * application with modern kernel headers and this warning will go + * away. + */ + +static void warn_deprecated_v2(void) +{ + static int warned; + + if (!warned) { + char name[sizeof(current->comm)]; + + printk(KERN_INFO "warning: `%s' uses deprecated v2" + " capabilities in a way that may be insecure.\n", + get_task_comm(name, current)); + warned = 1; + } +} + +/* + * Version check. Return the number of u32s in each capability flag + * array, or a negative value on error. + */ +static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) +{ + __u32 version; + + if (get_user(version, &header->version)) + return -EFAULT; + + switch (version) { + case _LINUX_CAPABILITY_VERSION_1: + warn_legacy_capability_use(); + *tocopy = _LINUX_CAPABILITY_U32S_1; + break; + case _LINUX_CAPABILITY_VERSION_2: + warn_deprecated_v2(); + /* + * fall through - v3 is otherwise equivalent to v2. + */ + case _LINUX_CAPABILITY_VERSION_3: + *tocopy = _LINUX_CAPABILITY_U32S_3; + break; + default: + if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) + return -EFAULT; + return -EINVAL; + } + + return 0; +} + +/* * For sys_getproccap() and sys_setproccap(), any of the three * capability set pointers may be NULL -- indicating that that set is * uninteresting and/or not to be changed. @@ -71,27 +134,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { int ret = 0; pid_t pid; - __u32 version; struct task_struct *target; unsigned tocopy; kernel_cap_t pE, pI, pP; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -118,7 +167,7 @@ out: spin_unlock(&task_capability_lock); if (!ret) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { @@ -128,7 +177,7 @@ out: } /* - * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S, + * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability @@ -240,30 +289,16 @@ static inline int cap_set_all(kernel_cap_t *effective, */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { - struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S]; + struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy; kernel_cap_t inheritable, permitted, effective; - __u32 version; struct task_struct *target; int ret; pid_t pid; - if (get_user(version, &header->version)) - return -EFAULT; - - switch (version) { - case _LINUX_CAPABILITY_VERSION_1: - warn_legacy_capability_use(); - tocopy = _LINUX_CAPABILITY_U32S_1; - break; - case _LINUX_CAPABILITY_VERSION_2: - tocopy = _LINUX_CAPABILITY_U32S_2; - break; - default: - if (put_user(_LINUX_CAPABILITY_VERSION, &header->version)) - return -EFAULT; - return -EINVAL; - } + ret = cap_validate_magic(header, &tocopy); + if (ret != 0) + return ret; if (get_user(pid, &header->pid)) return -EFAULT; @@ -281,7 +316,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } - while (i < _LINUX_CAPABILITY_U32S) { + while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9c2fb01e89..15ac0e1e4f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -44,6 +44,7 @@ #include <linux/kmod.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> +#include <linux/hash.h> #include <asm/atomic.h> @@ -118,17 +119,7 @@ static int root_count; * be called. */ static int need_forkexit_callback; - -/* bits in struct cgroup flags field */ -enum { - /* Control Group is dead */ - CGRP_REMOVED, - /* Control Group has previously had a child cgroup or a task, - * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ - CGRP_RELEASABLE, - /* Control Group requires release notifications to userspace */ - CGRP_NOTIFY_ON_RELEASE, -}; +static int need_mm_owner_callback __read_mostly; /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) @@ -204,6 +195,27 @@ static struct cg_cgroup_link init_css_set_link; static DEFINE_RWLOCK(css_set_lock); static int css_set_count; +/* hash table for cgroup groups. This improves the performance to + * find an existing css_set */ +#define CSS_SET_HASH_BITS 7 +#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) +static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; + +static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) +{ + int i; + int index; + unsigned long tmp = 0UL; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) + tmp += (unsigned long)css[i]; + tmp = (tmp >> 16) ^ tmp; + + index = hash_long(tmp, CSS_SET_HASH_BITS); + + return &css_set_table[index]; +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups @@ -230,7 +242,7 @@ static int use_task_css_set_links; static void unlink_css_set(struct css_set *cg) { write_lock(&css_set_lock); - list_del(&cg->list); + hlist_del(&cg->hlist); css_set_count--; while (!list_empty(&cg->cg_links)) { struct cg_cgroup_link *link; @@ -295,9 +307,7 @@ static inline void put_css_set_taskexit(struct css_set *cg) /* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing - * css_set is suitable. This currently walks a linked-list for - * simplicity; a later patch will use a hash table for better - * performance + * css_set is suitable. * * oldcg: the cgroup group that we're using before the cgroup * transition @@ -314,7 +324,9 @@ static struct css_set *find_existing_css_set( { int i; struct cgroupfs_root *root = cgrp->root; - struct list_head *l = &init_css_set.list; + struct hlist_head *hhead; + struct hlist_node *node; + struct css_set *cg; /* Built the set of subsystem state objects that we want to * see in the new css_set */ @@ -331,18 +343,13 @@ static struct css_set *find_existing_css_set( } } - /* Look through existing cgroup groups to find one to reuse */ - do { - struct css_set *cg = - list_entry(l, struct css_set, list); - + hhead = css_set_hash(template); + hlist_for_each_entry(cg, node, hhead, hlist) { if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { /* All subsystems matched */ return cg; } - /* Try the next cgroup group */ - l = l->next; - } while (l != &init_css_set.list); + } /* No existing cgroup group matched */ return NULL; @@ -404,6 +411,8 @@ static struct css_set *find_css_set( struct list_head tmp_cg_links; struct cg_cgroup_link *link; + struct hlist_head *hhead; + /* First see if we already have a cgroup group that matches * the desired set */ write_lock(&css_set_lock); @@ -428,6 +437,7 @@ static struct css_set *find_css_set( kref_init(&res->ref); INIT_LIST_HEAD(&res->cg_links); INIT_LIST_HEAD(&res->tasks); + INIT_HLIST_NODE(&res->hlist); /* Copy the set of subsystem state objects generated in * find_existing_css_set() */ @@ -467,9 +477,12 @@ static struct css_set *find_css_set( BUG_ON(!list_empty(&tmp_cg_links)); - /* Link this cgroup group into the list */ - list_add(&res->list, &init_css_set.list); css_set_count++; + + /* Add this cgroup group to the hash table */ + hhead = css_set_hash(res->subsys); + hlist_add_head(&res->hlist, hhead); + write_unlock(&css_set_lock); return res; @@ -562,7 +575,7 @@ static struct inode_operations cgroup_dir_inode_operations; static struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) @@ -782,7 +795,14 @@ static int parse_cgroupfs_options(char *data, if (!*token) return -EINVAL; if (!strcmp(token, "all")) { - opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1; + /* Add all non-disabled subsystems */ + int i; + opts->subsys_bits = 0; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (!ss->disabled) + opts->subsys_bits |= 1ul << i; + } } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { @@ -800,7 +820,8 @@ static int parse_cgroupfs_options(char *data, for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; if (!strcmp(token, ss->name)) { - set_bit(i, &opts->subsys_bits); + if (!ss->disabled) + set_bit(i, &opts->subsys_bits); break; } } @@ -940,7 +961,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *root; - struct list_head tmp_cg_links, *l; + struct list_head tmp_cg_links; INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ @@ -982,6 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, /* New superblock */ struct cgroup *cgrp = &root->top_cgroup; struct inode *inode; + int i; BUG_ON(sb->s_root != NULL); @@ -1026,22 +1048,25 @@ static int cgroup_get_sb(struct file_system_type *fs_type, /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); - l = &init_css_set.list; - do { + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct hlist_head *hhead = &css_set_table[i]; + struct hlist_node *node; struct css_set *cg; - struct cg_cgroup_link *link; - cg = list_entry(l, struct css_set, list); - BUG_ON(list_empty(&tmp_cg_links)); - link = list_entry(tmp_cg_links.next, - struct cg_cgroup_link, - cgrp_link_list); - list_del(&link->cgrp_link_list); - link->cg = cg; - list_add(&link->cgrp_link_list, - &root->top_cgroup.css_sets); - list_add(&link->cg_link_list, &cg->cg_links); - l = l->next; - } while (l != &init_css_set.list); + + hlist_for_each_entry(cg, node, hhead, hlist) { + struct cg_cgroup_link *link; + + BUG_ON(list_empty(&tmp_cg_links)); + link = list_entry(tmp_cg_links.next, + struct cg_cgroup_link, + cgrp_link_list); + list_del(&link->cgrp_link_list); + link->cg = cg; + list_add(&link->cgrp_link_list, + &root->top_cgroup.css_sets); + list_add(&link->cg_link_list, &cg->cg_links); + } + } write_unlock(&css_set_lock); free_cg_links(&tmp_cg_links); @@ -1299,18 +1324,16 @@ enum cgroup_filetype { FILE_DIR, FILE_TASKLIST, FILE_NOTIFY_ON_RELEASE, - FILE_RELEASABLE, FILE_RELEASE_AGENT, }; -static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { char buffer[64]; int retval = 0; - u64 val; char *end; if (!nbytes) @@ -1321,16 +1344,18 @@ static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - - /* strip newline if necessary */ - if (nbytes && (buffer[nbytes-1] == '\n')) - buffer[nbytes-1] = 0; - val = simple_strtoull(buffer, &end, 0); - if (*end) - return -EINVAL; - - /* Pass to subsystem */ - retval = cft->write_uint(cgrp, cft, val); + strstrip(buffer); + if (cft->write_u64) { + u64 val = simple_strtoull(buffer, &end, 0); + if (*end) + return -EINVAL; + retval = cft->write_u64(cgrp, cft, val); + } else { + s64 val = simple_strtoll(buffer, &end, 0); + if (*end) + return -EINVAL; + retval = cft->write_s64(cgrp, cft, val); + } if (!retval) retval = nbytes; return retval; @@ -1411,23 +1436,39 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); - if (cft->write_uint) - return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_u64 || cft->write_s64) + return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->trigger) { + int ret = cft->trigger(cgrp, (unsigned int)cft->private); + return ret ? ret : nbytes; + } return -EINVAL; } -static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { char tmp[64]; - u64 val = cft->read_uint(cgrp, cft); + u64 val = cft->read_u64(cgrp, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } +static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) +{ + char tmp[64]; + s64 val = cft->read_s64(cgrp, cft); + int len = sprintf(tmp, "%lld\n", (long long) val); + + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); +} + static ssize_t cgroup_common_file_read(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -1482,11 +1523,56 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); - if (cft->read_uint) - return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); + if (cft->read_u64) + return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->read_s64) + return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); return -EINVAL; } +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. + */ + +struct cgroup_seqfile_state { + struct cftype *cft; + struct cgroup *cgroup; +}; + +static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) +{ + struct seq_file *sf = cb->state; + return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); +} + +static int cgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct cgroup_seqfile_state *state = m->private; + struct cftype *cft = state->cft; + if (cft->read_map) { + struct cgroup_map_cb cb = { + .fill = cgroup_map_add, + .state = m, + }; + return cft->read_map(state->cgroup, cft, &cb); + } + return cft->read_seq_string(state->cgroup, cft, m); +} + +int cgroup_seqfile_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + kfree(seq->private); + return single_release(inode, file); +} + +static struct file_operations cgroup_seqfile_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = cgroup_seqfile_release, +}; + static int cgroup_file_open(struct inode *inode, struct file *file) { int err; @@ -1499,7 +1585,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) cft = __d_cft(file->f_dentry); if (!cft) return -ENODEV; - if (cft->open) + if (cft->read_map || cft->read_seq_string) { + struct cgroup_seqfile_state *state = + kzalloc(sizeof(*state), GFP_USER); + if (!state) + return -ENOMEM; + state->cft = cft; + state->cgroup = __d_cgrp(file->f_dentry->d_parent); + file->f_op = &cgroup_seqfile_operations; + err = single_open(file, cgroup_seqfile_show, state); + if (err < 0) + kfree(state); + } else if (cft->open) err = cft->open(inode, file); else err = 0; @@ -1707,14 +1804,19 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * The tasklist_lock is not held here, as do_each_thread() and * while_each_thread() are protected by RCU. */ -void cgroup_enable_task_cg_lists(void) +static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; do_each_thread(g, p) { task_lock(p); - if (list_empty(&p->cg_list)) + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + */ + if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); @@ -1900,14 +2002,14 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) if (heap->size) { for (i = 0; i < heap->size; i++) { - struct task_struct *p = heap->ptrs[i]; + struct task_struct *q = heap->ptrs[i]; if (i == 0) { - latest_time = p->start_time; - latest_task = p; + latest_time = q->start_time; + latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(p, scan); - put_task_struct(p); + scan->process_task(q, scan); + put_task_struct(q); } /* * If we had to process any tasks at all, scan again @@ -2082,7 +2184,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) kfree(pidarray); } else { - ctr->buf = 0; + ctr->buf = NULL; ctr->bufsz = 0; } file->private_data = ctr; @@ -2125,11 +2227,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, return notify_on_release(cgrp); } -static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - /* * for the common functions, 'private' gives the type of file */ @@ -2145,16 +2242,10 @@ static struct cftype files[] = { { .name = "notify_on_release", - .read_uint = cgroup_read_notify_on_release, + .read_u64 = cgroup_read_notify_on_release, .write = cgroup_common_file_write, .private = FILE_NOTIFY_ON_RELEASE, }, - - { - .name = "releasable", - .read_uint = cgroup_read_releasable, - .private = FILE_RELEASABLE, - } }; static struct cftype cft_release_agent = { @@ -2388,10 +2479,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return 0; } -static void cgroup_init_subsys(struct cgroup_subsys *ss) +static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; - struct list_head *l; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); @@ -2402,34 +2492,19 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); - /* Update all cgroup groups to contain a subsys + /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is - * newly registered, all tasks and hence all cgroup - * groups are in the subsystem's top cgroup. */ - write_lock(&css_set_lock); - l = &init_css_set.list; - do { - struct css_set *cg = - list_entry(l, struct css_set, list); - cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; - l = l->next; - } while (l != &init_css_set.list); - write_unlock(&css_set_lock); - - /* If this subsystem requested that it be notified with fork - * events, we should send it one now for every process in the - * system */ - if (ss->fork) { - struct task_struct *g, *p; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - ss->fork(ss, p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - } + * newly registered, all tasks and hence the + * init_css_set is in the subsystem's top cgroup. */ + init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; need_forkexit_callback |= ss->fork || ss->exit; + need_mm_owner_callback |= !!ss->mm_owner_changed; + + /* At system boot, before all subsystems have been + * registered, no tasks have been forked, so we don't + * need to invoke fork callbacks here. */ + BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; } @@ -2445,9 +2520,9 @@ int __init cgroup_init_early(void) int i; kref_init(&init_css_set.ref); kref_get(&init_css_set.ref); - INIT_LIST_HEAD(&init_css_set.list); INIT_LIST_HEAD(&init_css_set.cg_links); INIT_LIST_HEAD(&init_css_set.tasks); + INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&rootnode); list_add(&rootnode.root_list, &roots); @@ -2460,6 +2535,9 @@ int __init cgroup_init_early(void) list_add(&init_css_set_link.cg_link_list, &init_css_set.cg_links); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&css_set_table[i]); + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -2489,7 +2567,7 @@ int __init cgroup_init(void) { int err; int i; - struct proc_dir_entry *entry; + struct hlist_head *hhead; err = bdi_init(&cgroup_backing_dev_info); if (err) @@ -2501,13 +2579,15 @@ int __init cgroup_init(void) cgroup_init_subsys(ss); } + /* Add init_css_set to the hash table */ + hhead = css_set_hash(init_css_set.subsys); + hlist_add_head(&init_css_set.hlist, hhead); + err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; - entry = create_proc_entry("cgroups", 0, NULL); - if (entry) - entry->proc_fops = &proc_cgroupstats_operations; + proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); out: if (err) @@ -2561,6 +2641,7 @@ static int proc_cgroup_show(struct seq_file *m, void *v) /* Skip this hierarchy if it has no active subsystems */ if (!root->actual_subsys_bits) continue; + seq_printf(m, "%lu:", root->subsys_bits); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); seq_putc(m, ':'); @@ -2600,13 +2681,13 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) { int i; - seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n"); + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\n", + seq_printf(m, "%s\t%lu\t%d\t%d\n", ss->name, ss->root->subsys_bits, - ss->root->number_of_cgroups); + ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); return 0; @@ -2614,7 +2695,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) static int cgroupstats_open(struct inode *inode, struct file *file) { - return single_open(file, proc_cgroupstats_show, 0); + return single_open(file, proc_cgroupstats_show, NULL); } static struct file_operations proc_cgroupstats_operations = { @@ -2669,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child) } } +#ifdef CONFIG_MM_OWNER +/** + * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes + * @p: the new owner + * + * Called on every change to mm->owner. mm_init_owner() does not + * invoke this routine, since it assigns the mm->owner the first time + * and does not change it. + */ +void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) +{ + struct cgroup *oldcgrp, *newcgrp; + + if (need_mm_owner_callback) { + int i; + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + oldcgrp = task_cgroup(old, ss->subsys_id); + newcgrp = task_cgroup(new, ss->subsys_id); + if (oldcgrp == newcgrp) + continue; + if (ss->mm_owner_changed) + ss->mm_owner_changed(ss, oldcgrp, newcgrp); + } + } +} +#endif /* CONFIG_MM_OWNER */ + /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question @@ -2794,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) cg = tsk->cgroups; parent = task_cgroup(tsk, subsys->subsys_id); - snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid); + snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid); /* Pin the hierarchy */ atomic_inc(&parent->root->sb->s_active); @@ -3010,3 +3119,27 @@ static void cgroup_release_agent(struct work_struct *work) spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } + +static int __init cgroup_disable(char *str) +{ + int i; + char *token; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + if (!strcmp(token, ss->name)) { + ss->disabled = 1; + printk(KERN_INFO "Disabling %s control group" + " subsystem\n", ss->name); + break; + } + } + } + return 1; +} +__setup("cgroup_disable=", cgroup_disable); diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index 37301e877cb..c3dc3aba4c0 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -1,5 +1,5 @@ /* - * kernel/ccontainer_debug.c - Example cgroup subsystem that + * kernel/cgroup_debug.c - Example cgroup subsystem that * exposes debug info * * Copyright (C) Google Inc, 2007 @@ -62,25 +62,35 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, return count; } +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + static struct cftype files[] = { { .name = "cgroup_refcount", - .read_uint = cgroup_refcount_read, + .read_u64 = cgroup_refcount_read, }, { .name = "taskcount", - .read_uint = taskcount_read, + .read_u64 = taskcount_read, }, { .name = "current_css_set", - .read_uint = current_css_set_read, + .read_u64 = current_css_set_read, }, { .name = "current_css_set_refcount", - .read_uint = current_css_set_refcount_read, + .read_u64 = current_css_set_refcount_read, }, + + { + .name = "releasable", + .read_u64 = releasable_read, + } }; static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/compat.c b/kernel/compat.c index 5f0e201bcfd..32c254a8ab9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -47,15 +47,14 @@ static long compat_nanosleep_restart(struct restart_block *restart) mm_segment_t oldfs; long ret; - rmtp = (struct compat_timespec __user *)(restart->arg1); - restart->arg1 = (unsigned long)&rmt; + restart->nanosleep.rmtp = (struct timespec __user *) &rmt; oldfs = get_fs(); set_fs(KERNEL_DS); ret = hrtimer_nanosleep_restart(restart); set_fs(oldfs); if (ret) { - restart->arg1 = (unsigned long)rmtp; + rmtp = restart->nanosleep.compat_rmtp; if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; @@ -89,7 +88,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, = ¤t_thread_info()->restart_block; restart->fn = compat_nanosleep_restart; - restart->arg1 = (unsigned long)rmtp; + restart->nanosleep.compat_rmtp = rmtp; if (rmtp && put_compat_timespec(&rmt, rmtp)) return -EFAULT; @@ -446,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, @@ -607,9 +606,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1); + struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; - restart->arg1 = (unsigned long) &tu; + restart->nanosleep.rmtp = (struct timespec __user *) &tu; oldfs = get_fs(); set_fs(KERNEL_DS); err = clock_nanosleep_restart(restart); @@ -621,7 +620,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) if (err == -ERESTART_RESTARTBLOCK) { restart->fn = compat_clock_nanosleep_restart; - restart->arg1 = (unsigned long) rmtp; + restart->nanosleep.compat_rmtp = rmtp; } return err; } @@ -652,7 +651,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, if (err == -ERESTART_RESTARTBLOCK) { restart = ¤t_thread_info()->restart_block; restart->fn = compat_clock_nanosleep_restart; - restart->arg1 = (unsigned long) rmtp; + restart->nanosleep.compat_rmtp = rmtp; } return err; } @@ -899,7 +898,7 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat current->state = TASK_INTERRUPTIBLE; schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); return -ERESTARTNOHAND; } #endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */ @@ -956,7 +955,8 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) __put_user(txc.jitcnt, &utp->jitcnt) || __put_user(txc.calcnt, &utp->calcnt) || __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt)) + __put_user(txc.stbcnt, &utp->stbcnt) || + __put_user(txc.tai, &utp->tai)) ret = -EFAULT; return ret; @@ -1081,4 +1081,3 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } - diff --git a/kernel/configs.c b/kernel/configs.c index e84d3f9c6c7..4c345210ed8 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -79,12 +79,11 @@ static int __init ikconfig_init(void) struct proc_dir_entry *entry; /* create the current config file */ - entry = create_proc_entry("config.gz", S_IFREG | S_IRUGO, - &proc_root); + entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL, + &ikconfig_file_ops); if (!entry) return -ENOMEM; - entry->proc_fops = &ikconfig_file_ops; entry->size = kernel_config_data_size; return 0; @@ -95,7 +94,7 @@ static int __init ikconfig_init(void) static void __exit ikconfig_cleanup(void) { - remove_proc_entry("config.gz", &proc_root); + remove_proc_entry("config.gz", NULL); } module_init(ikconfig_init); diff --git a/kernel/cpu.c b/kernel/cpu.c index 2eff3f63abe..c77bc3a1c72 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -33,17 +33,13 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; - wait_queue_head_t writer_queue; } cpu_hotplug; -#define writer_exists() (cpu_hotplug.active_writer != NULL) - void __init cpu_hotplug_init(void) { cpu_hotplug.active_writer = NULL; mutex_init(&cpu_hotplug.lock); cpu_hotplug.refcount = 0; - init_waitqueue_head(&cpu_hotplug.writer_queue); } #ifdef CONFIG_HOTPLUG_CPU @@ -65,11 +61,8 @@ void put_online_cpus(void) if (cpu_hotplug.active_writer == current) return; mutex_lock(&cpu_hotplug.lock); - cpu_hotplug.refcount--; - - if (unlikely(writer_exists()) && !cpu_hotplug.refcount) - wake_up(&cpu_hotplug.writer_queue); - + if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) + wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); } @@ -98,8 +91,8 @@ void cpu_maps_update_done(void) * Note that during a cpu-hotplug operation, the new readers, if any, * will be blocked by the cpu_hotplug.lock * - * Since cpu_maps_update_begin is always called after invoking - * cpu_maps_update_begin, we can be sure that only one writer is active. + * Since cpu_hotplug_begin() is always called after invoking + * cpu_maps_update_begin(), we can be sure that only one writer is active. * * Note that theoretically, there is a possibility of a livelock: * - Refcount goes to zero, last reader wakes up the sleeping @@ -115,19 +108,16 @@ void cpu_maps_update_done(void) */ static void cpu_hotplug_begin(void) { - DECLARE_WAITQUEUE(wait, current); - - mutex_lock(&cpu_hotplug.lock); - cpu_hotplug.active_writer = current; - add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); - while (cpu_hotplug.refcount) { - set_current_state(TASK_UNINTERRUPTIBLE); + + for (;;) { + mutex_lock(&cpu_hotplug.lock); + if (likely(!cpu_hotplug.refcount)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); mutex_unlock(&cpu_hotplug.lock); schedule(); - mutex_lock(&cpu_hotplug.lock); } - remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); } static void cpu_hotplug_done(void) @@ -136,7 +126,7 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } /* Need to know about CPUs going up/down? */ -int __cpuinit register_cpu_notifier(struct notifier_block *nb) +int __ref register_cpu_notifier(struct notifier_block *nb) { int ret; cpu_maps_update_begin(); @@ -149,7 +139,7 @@ int __cpuinit register_cpu_notifier(struct notifier_block *nb) EXPORT_SYMBOL(register_cpu_notifier); -void unregister_cpu_notifier(struct notifier_block *nb) +void __ref unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); @@ -180,7 +170,7 @@ struct take_cpu_down_param { }; /* Take this CPU down. */ -static int take_cpu_down(void *_param) +static int __ref take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; int err; @@ -199,7 +189,7 @@ static int take_cpu_down(void *_param) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; struct task_struct *p; @@ -225,16 +215,16 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", - __FUNCTION__, cpu); + __func__, cpu); err = -EINVAL; goto out_release; } /* Ensure that we are not runnable on dying cpu */ old_allowed = current->cpus_allowed; - tmp = CPU_MASK_ALL; + cpus_setall(tmp); cpu_clear(cpu, tmp); - set_cpus_allowed(current, tmp); + set_cpus_allowed_ptr(current, &tmp); p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); @@ -268,13 +258,13 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) out_thread: err = kthread_stop(p); out_allowed: - set_cpus_allowed(current, old_allowed); + set_cpus_allowed_ptr(current, &old_allowed); out_release: cpu_hotplug_done(); return err; } -int cpu_down(unsigned int cpu) +int __ref cpu_down(unsigned int cpu) { int err = 0; @@ -305,7 +295,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) if (ret == NOTIFY_BAD) { nr_calls--; printk("%s: attempt to bring up CPU %u failed\n", - __FUNCTION__, cpu); + __func__, cpu); ret = -EINVAL; goto out_notify; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a1b61f41422..9fceb97e989 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -98,6 +98,9 @@ struct cpuset { /* partition number for rebuild_sched_domains() */ int pn; + /* for custom sched domain */ + int relax_domain_level; + /* used for walking a cpuset heirarchy */ struct list_head stack_list; }; @@ -124,6 +127,7 @@ struct cpuset_hotplug_scanner { typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + CS_MEM_HARDWALL, CS_MEMORY_MIGRATE, CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, @@ -141,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs) return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + static inline int is_sched_load_balance(const struct cpuset *cs) { return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); @@ -478,6 +487,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) return cpus_intersects(a->cpus_allowed, b->cpus_allowed); } +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (!dattr) + return; + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; + return; +} + /* * rebuild_sched_domains() * @@ -553,12 +572,14 @@ static void rebuild_sched_domains(void) int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ q = NULL; csa = NULL; doms = NULL; + dattr = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { @@ -566,6 +587,11 @@ static void rebuild_sched_domains(void) doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr(dattr, &top_cpuset); + } *doms = top_cpuset.cpus_allowed; goto rebuild; } @@ -622,6 +648,7 @@ restart: doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; @@ -644,12 +671,15 @@ restart: } cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; + update_domain_attr(dattr, b); } } nslot++; @@ -660,7 +690,7 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ get_online_cpus(); - partition_sched_domains(ndoms, doms); + partition_sched_domains(ndoms, doms, dattr); put_online_cpus(); done: @@ -668,6 +698,7 @@ done: kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ + /* Don't kfree(dattr) -- partition_sched_domains() does that. */ } static inline int started_after_time(struct task_struct *t1, @@ -710,7 +741,8 @@ static inline int started_after(void *p1, void *p2) * Return nonzero if this tasks's cpus_allowed mask should be changed (in other * words, if its mask is not equal to its cpuset's mask). */ -int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +static int cpuset_test_cpumask(struct task_struct *tsk, + struct cgroup_scanner *scan) { return !cpus_equal(tsk->cpus_allowed, (cgroup_cs(scan->cg))->cpus_allowed); @@ -727,9 +759,10 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cgroup_lock() at this point. */ -void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) +static void cpuset_change_cpumask(struct task_struct *tsk, + struct cgroup_scanner *scan) { - set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); + set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -764,8 +797,10 @@ static int update_cpumask(struct cpuset *cs, char *buf) retval = cpulist_parse(buf, trialcs.cpus_allowed); if (retval < 0) return retval; + + if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) + return -EINVAL; } - cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map); retval = validate_change(cs, &trialcs); if (retval < 0) return retval; @@ -899,9 +934,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) goto done; + + if (!nodes_subset(trialcs.mems_allowed, + node_states[N_HIGH_MEMORY])) + return -EINVAL; } - nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, - node_states[N_HIGH_MEMORY]); oldmem = cs->mems_allowed; if (nodes_equal(oldmem, trialcs.mems_allowed)) { retval = 0; /* Too easy - nothing to do */ @@ -916,7 +953,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = cpuset_mems_generation++; mutex_unlock(&callback_mutex); - cpuset_being_rebound = cs; /* causes mpol_copy() rebind */ + cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ fudge = 10; /* spare mmarray[] slots */ fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ @@ -967,7 +1004,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) * rebind the vma mempolicies of each mm in mmarray[] to their * new cpuset, and release that mm. The mpol_rebind_mm() * call takes mmap_sem, which we couldn't take while holding - * tasklist_lock. Forks can happen again now - the mpol_copy() + * tasklist_lock. Forks can happen again now - the mpol_dup() * cpuset_being_rebound check will catch such forks, and rebind * their vma mempolicies too. Because we still hold the global * cgroup_mutex, we know that no other rebind effort will @@ -998,40 +1035,35 @@ int current_cpuset_is_being_rebound(void) return task_cs(current) == cpuset_being_rebound; } -/* - * Call with cgroup_mutex held. - */ - -static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) +static int update_relax_domain_level(struct cpuset *cs, s64 val) { - if (simple_strtoul(buf, NULL, 10) != 0) - cpuset_memory_pressure_enabled = 1; - else - cpuset_memory_pressure_enabled = 0; + if (val < -1 || val >= SD_LV_MAX) + return -EINVAL; + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + rebuild_sched_domains(); + } + return 0; } /* * update_flag - read a 0 or a 1 in a file and update associated flag - * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_SCHED_LOAD_BALANCE, - * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, - * CS_SPREAD_PAGE, CS_SPREAD_SLAB) - * cs: the cpuset to update - * buf: the buffer where we read the 0 or 1 + * bit: the bit to update (see cpuset_flagbits_t) + * cs: the cpuset to update + * turning_on: whether the flag is being set or cleared * * Call with cgroup_mutex held. */ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on) { - int turning_on; struct cpuset trialcs; int err; int cpus_nonempty, balance_flag_changed; - turning_on = (simple_strtoul(buf, NULL, 10) != 0); - trialcs = *cs; if (turning_on) set_bit(bit, &trialcs.flags); @@ -1178,7 +1210,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed(tsk, cpus); + set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); from = oldcs->mems_allowed; @@ -1201,7 +1233,9 @@ typedef enum { FILE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, FILE_SCHED_LOAD_BALANCE, + FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, @@ -1224,7 +1258,8 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, return -E2BIG; /* +1 for nul-terminator */ - if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0) + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buffer) return -ENOMEM; if (copy_from_user(buffer, userbuf, nbytes)) { @@ -1247,43 +1282,92 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, case FILE_MEMLIST: retval = update_nodemask(cs, buffer); break; + default: + retval = -EINVAL; + goto out2; + } + + if (retval == 0) + retval = nbytes; +out2: + cgroup_unlock(); +out1: + kfree(buffer); + return retval; +} + +static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + cgroup_lock(); + + if (cgroup_is_removed(cgrp)) { + cgroup_unlock(); + return -ENODEV; + } + + switch (type) { case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + retval = update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: - retval = update_memory_pressure_enabled(cs, buffer); + cpuset_memory_pressure_enabled = !!val; break; case FILE_MEMORY_PRESSURE: retval = -EACCES; break; case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, buffer); + retval = update_flag(CS_SPREAD_PAGE, cs, val); cs->mems_generation = cpuset_mems_generation++; break; case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, buffer); + retval = update_flag(CS_SPREAD_SLAB, cs, val); cs->mems_generation = cpuset_mems_generation++; break; default: retval = -EINVAL; - goto out2; + break; } + cgroup_unlock(); + return retval; +} - if (retval == 0) - retval = nbytes; -out2: +static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + cgroup_lock(); + + if (cgroup_is_removed(cgrp)) { + cgroup_unlock(); + return -ENODEV; + } + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } cgroup_unlock(); -out1: - kfree(buffer); return retval; } @@ -1345,30 +1429,6 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_MEMLIST: s += cpuset_sprintf_memlist(s, cs); break; - case FILE_CPU_EXCLUSIVE: - *s++ = is_cpu_exclusive(cs) ? '1' : '0'; - break; - case FILE_MEM_EXCLUSIVE: - *s++ = is_mem_exclusive(cs) ? '1' : '0'; - break; - case FILE_SCHED_LOAD_BALANCE: - *s++ = is_sched_load_balance(cs) ? '1' : '0'; - break; - case FILE_MEMORY_MIGRATE: - *s++ = is_memory_migrate(cs) ? '1' : '0'; - break; - case FILE_MEMORY_PRESSURE_ENABLED: - *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; - break; - case FILE_MEMORY_PRESSURE: - s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); - break; - case FILE_SPREAD_PAGE: - *s++ = is_spread_page(cs) ? '1' : '0'; - break; - case FILE_SPREAD_SLAB: - *s++ = is_spread_slab(cs) ? '1' : '0'; - break; default: retval = -EINVAL; goto out; @@ -1381,111 +1441,149 @@ out: return retval; } +static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) +{ + struct cpuset *cs = cgroup_cs(cont); + cpuset_filetype_t type = cft->private; + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } +} - +static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) +{ + struct cpuset *cs = cgroup_cs(cont); + cpuset_filetype_t type = cft->private; + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } +} /* * for the common functions, 'private' gives the type of file */ -static struct cftype cft_cpus = { - .name = "cpus", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_CPULIST, -}; - -static struct cftype cft_mems = { - .name = "mems", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEMLIST, -}; - -static struct cftype cft_cpu_exclusive = { - .name = "cpu_exclusive", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_CPU_EXCLUSIVE, -}; - -static struct cftype cft_mem_exclusive = { - .name = "mem_exclusive", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEM_EXCLUSIVE, -}; - -static struct cftype cft_sched_load_balance = { - .name = "sched_load_balance", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_SCHED_LOAD_BALANCE, -}; - -static struct cftype cft_memory_migrate = { - .name = "memory_migrate", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEMORY_MIGRATE, +static struct cftype files[] = { + { + .name = "cpus", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + .name = "memory_spread_slab", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, }; static struct cftype cft_memory_pressure_enabled = { .name = "memory_pressure_enabled", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, .private = FILE_MEMORY_PRESSURE_ENABLED, }; -static struct cftype cft_memory_pressure = { - .name = "memory_pressure", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_MEMORY_PRESSURE, -}; - -static struct cftype cft_spread_page = { - .name = "memory_spread_page", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_SPREAD_PAGE, -}; - -static struct cftype cft_spread_slab = { - .name = "memory_spread_slab", - .read = cpuset_common_file_read, - .write = cpuset_common_file_write, - .private = FILE_SPREAD_SLAB, -}; - static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; - if ((err = cgroup_add_file(cont, ss, &cft_cpus)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_mems)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_cpu_exclusive)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_mem_exclusive)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) - return err; - if ((err = cgroup_add_file(cont, ss, &cft_spread_slab)) < 0) + err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); + if (err) return err; /* memory_pressure_enabled is in root cpuset only */ - if (err == 0 && !cont->parent) + if (!cont->parent) err = cgroup_add_file(cont, ss, - &cft_memory_pressure_enabled); - return 0; + &cft_memory_pressure_enabled); + return err; } /* @@ -1555,10 +1653,11 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cs->cpus_allowed = CPU_MASK_NONE; - cs->mems_allowed = NODE_MASK_NONE; + cpus_clear(cs->cpus_allowed); + nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; cs->parent = parent; number_of_cpusets++; @@ -1584,7 +1683,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) cpuset_update_task_memory_state(); if (is_sched_load_balance(cs)) - update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); number_of_cpusets--; kfree(cs); @@ -1625,12 +1724,13 @@ int __init cpuset_init(void) { int err = 0; - top_cpuset.cpus_allowed = CPU_MASK_ALL; - top_cpuset.mems_allowed = NODE_MASK_ALL; + cpus_setall(top_cpuset.cpus_allowed); + nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); + top_cpuset.relax_domain_level = -1; err = register_filesystem(&cpuset_fs_type); if (err < 0) @@ -1648,7 +1748,8 @@ int __init cpuset_init(void) * Called by cgroup_scan_tasks() for each task in a cgroup. * Return nonzero to stop the walk through the tasks. */ -void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) +static void cpuset_do_move_task(struct task_struct *tsk, + struct cgroup_scanner *scan) { struct cpuset_hotplug_scanner *chsp; @@ -1789,6 +1890,12 @@ static void common_cpu_mem_hotplug_unplug(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; scan_for_empty_cpusets(&top_cpuset); + /* + * Scheduler destroys domains on hotplug events. + * Rebuild them based on the current settings. + */ + rebuild_sched_domains(); + cgroup_unlock(); } @@ -1844,6 +1951,7 @@ void __init cpuset_init_smp(void) * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. * * Description: Returns the cpumask_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -1851,35 +1959,27 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) +void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - mutex_lock(&callback_mutex); - mask = cpuset_cpus_allowed_locked(tsk); + cpuset_cpus_allowed_locked(tsk, pmask); mutex_unlock(&callback_mutex); - - return mask; } /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), &mask); + guarantee_online_cpus(task_cs(tsk), pmask); task_unlock(tsk); - - return mask; } void cpuset_init_current_mems_allowed(void) { - current->mems_allowed = NODE_MASK_ALL; + nodes_setall(current->mems_allowed); } /** @@ -1906,33 +2006,25 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) } /** - * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed - * @zl: the zonelist to be checked + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed + * @nodemask: the nodemask to be checked * - * Are any of the nodes on zonelist zl allowed in current->mems_allowed? + * Are any of the nodes in the nodemask allowed in current->mems_allowed? */ -int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) +int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { - int i; - - for (i = 0; zl->zones[i]; i++) { - int nid = zone_to_nid(zl->zones[i]); - - if (node_isset(nid, current->mems_allowed)) - return 1; - } - return 0; + return nodes_intersects(*nodemask, current->mems_allowed); } /* - * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive - * ancestor to the specified cpuset. Call holding callback_mutex. - * If no ancestor is mem_exclusive (an unusual configuration), then - * returns the root cpuset. + * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or + * mem_hardwall ancestor to the specified cpuset. Call holding + * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall + * (an unusual configuration), then returns the root cpuset. */ -static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) { - while (!is_mem_exclusive(cs) && cs->parent) + while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) cs = cs->parent; return cs; } @@ -1946,7 +2038,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * __GFP_THISNODE is set, yes, we can always allocate. If zone * z's node is in our tasks mems_allowed, yes. If it's not a * __GFP_HARDWALL request and this zone's nodes is in the nearest - * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * hardwalled cpuset ancestor to this tasks cpuset, yes. * If the task has been OOM killed and has access to memory reserves * as specified by the TIF_MEMDIE flag, yes. * Otherwise, no. @@ -1969,7 +2061,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * and do not allow allocations outside the current tasks cpuset * unless the task has been OOM killed as is marked TIF_MEMDIE. * GFP_KERNEL allocations are not so marked, so can escape to the - * nearest enclosing mem_exclusive ancestor cpuset. + * nearest enclosing hardwalled ancestor cpuset. * * Scanning up parent cpusets requires callback_mutex. The * __alloc_pages() routine only calls here with __GFP_HARDWALL bit @@ -1992,7 +2084,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * TIF_MEMDIE - any node ok - * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. * * Rule: @@ -2029,7 +2121,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) mutex_lock(&callback_mutex); task_lock(current); - cs = nearest_exclusive_ancestor(task_cs(current)); + cs = nearest_hardwall_ancestor(task_cs(current)); task_unlock(current); allowed = node_isset(node, cs->mems_allowed); @@ -2261,8 +2353,16 @@ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, task->cpus_allowed); seq_printf(m, "\n"); + seq_printf(m, "Cpus_allowed_list:\t"); + m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, + task->cpus_allowed); + seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, task->mems_allowed); seq_printf(m, "\n"); + seq_printf(m, "Mems_allowed_list:\t"); + m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, + task->mems_allowed); + seq_printf(m, "\n"); } diff --git a/kernel/dma.c b/kernel/dma.c index 6a82bb716da..d2c60a82279 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -149,12 +149,7 @@ static const struct file_operations proc_dma_operations = { static int __init proc_dma_init(void) { - struct proc_dir_entry *e; - - e = create_proc_entry("dma", 0, NULL); - if (e) - e->proc_fops = &proc_dma_operations; - + proc_create("dma", 0, NULL, &proc_dma_operations); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 53872bf993f..8f6185e69b6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -19,6 +19,7 @@ #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> @@ -52,6 +53,11 @@ static void exit_mm(struct task_struct * tsk); +static inline int task_detached(struct task_struct *p) +{ + return p->exit_signal == -1; +} + static void __unhash_process(struct task_struct *p) { nr_threads--; @@ -120,6 +126,12 @@ static void __exit_signal(struct task_struct *tsk) __unhash_process(tsk); + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); @@ -127,7 +139,6 @@ static void __exit_signal(struct task_struct *tsk) __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - flush_sigqueue(&tsk->pending); if (sig) { flush_sigqueue(&sig->shared_pending); taskstats_tgid_free(sig); @@ -160,7 +171,7 @@ repeat: zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { - BUG_ON(leader->exit_signal == -1); + BUG_ON(task_detached(leader)); do_notify_parent(leader, leader->exit_signal); /* * If we were the last child thread and the leader has @@ -170,7 +181,7 @@ repeat: * do_notify_parent() will have marked it self-reaping in * that case. */ - zap_leader = (leader->exit_signal == -1); + zap_leader = task_detached(leader); } write_unlock_irq(&tasklist_lock); @@ -329,13 +340,11 @@ void __set_special_pids(struct pid *pid) pid_t nr = pid_nr(pid); if (task_session(curr) != pid) { - detach_pid(curr, PIDTYPE_SID); - attach_pid(curr, PIDTYPE_SID, pid); + change_pid(curr, PIDTYPE_SID, pid); set_task_session(curr, nr); } if (task_pgrp(curr) != pid) { - detach_pid(curr, PIDTYPE_PGID); - attach_pid(curr, PIDTYPE_PGID, pid); + change_pid(curr, PIDTYPE_PGID, pid); set_task_pgrp(curr, nr); } } @@ -507,10 +516,9 @@ void put_files_struct(struct files_struct *files) } } -EXPORT_SYMBOL(put_files_struct); - -void reset_files_struct(struct task_struct *tsk, struct files_struct *files) +void reset_files_struct(struct files_struct *files) { + struct task_struct *tsk = current; struct files_struct *old; old = tsk->files; @@ -519,9 +527,8 @@ void reset_files_struct(struct task_struct *tsk, struct files_struct *files) task_unlock(tsk); put_files_struct(old); } -EXPORT_SYMBOL(reset_files_struct); -static void __exit_files(struct task_struct *tsk) +void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; @@ -533,12 +540,7 @@ static void __exit_files(struct task_struct *tsk) } } -void exit_files(struct task_struct *tsk) -{ - __exit_files(tsk); -} - -static void __put_fs_struct(struct fs_struct *fs) +void put_fs_struct(struct fs_struct *fs) { /* No need to hold fs->lock if we are killing it */ if (atomic_dec_and_test(&fs->count)) { @@ -550,12 +552,7 @@ static void __put_fs_struct(struct fs_struct *fs) } } -void put_fs_struct(struct fs_struct *fs) -{ - __put_fs_struct(fs); -} - -static void __exit_fs(struct task_struct *tsk) +void exit_fs(struct task_struct *tsk) { struct fs_struct * fs = tsk->fs; @@ -563,16 +560,93 @@ static void __exit_fs(struct task_struct *tsk) task_lock(tsk); tsk->fs = NULL; task_unlock(tsk); - __put_fs_struct(fs); + put_fs_struct(fs); } } -void exit_fs(struct task_struct *tsk) +EXPORT_SYMBOL_GPL(exit_fs); + +#ifdef CONFIG_MM_OWNER +/* + * Task p is exiting and it owned mm, lets find a new owner for it + */ +static inline int +mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) { - __exit_fs(tsk); + /* + * If there are other users of the mm and the owner (us) is exiting + * we need to find a new owner to take on the responsibility. + */ + if (!mm) + return 0; + if (atomic_read(&mm->mm_users) <= 1) + return 0; + if (mm->owner != p) + return 0; + return 1; } -EXPORT_SYMBOL_GPL(exit_fs); +void mm_update_next_owner(struct mm_struct *mm) +{ + struct task_struct *c, *g, *p = current; + +retry: + if (!mm_need_new_owner(mm, p)) + return; + + read_lock(&tasklist_lock); + /* + * Search in the children + */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search in the siblings + */ + list_for_each_entry(c, &p->parent->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search through everything else. We should not get + * here often + */ + do_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + } while_each_thread(g, c); + + read_unlock(&tasklist_lock); + return; + +assign_new_owner: + BUG_ON(c == p); + get_task_struct(c); + /* + * The task_lock protects c->mm from changing. + * We always want mm->owner->mm == mm + */ + task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); + if (c->mm != mm) { + task_unlock(c); + put_task_struct(c); + goto retry; + } + cgroup_mm_owner_callbacks(mm->owner, c); + mm->owner = c; + task_unlock(c); + put_task_struct(c); +} +#endif /* CONFIG_MM_OWNER */ /* * Turn us into a lazy TLB process if we @@ -613,6 +687,7 @@ static void exit_mm(struct task_struct * tsk) /* We don't want this task to be frozen prematurely */ clear_freeze_flag(tsk); task_unlock(tsk); + mm_update_next_owner(mm); mmput(mm); } @@ -627,7 +702,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) if (unlikely(traced)) { /* Preserve ptrace links if someone else is tracing this child. */ list_del_init(&p->ptrace_list); - if (p->parent != p->real_parent) + if (ptrace_reparented(p)) list_add(&p->ptrace_list, &p->real_parent->ptrace_children); } else { /* If this child is being traced, then we're the one tracing it @@ -651,18 +726,18 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) /* If this is a threaded reparent there is no need to * notify anyone anything has happened. */ - if (p->real_parent->group_leader == father->group_leader) + if (same_thread_group(p->real_parent, father)) return; /* We don't want people slaying init. */ - if (p->exit_signal != -1) + if (!task_detached(p)) p->exit_signal = SIGCHLD; /* If we'd notified the old parent about this child's death, * also notify the new parent. */ if (!traced && p->exit_state == EXIT_ZOMBIE && - p->exit_signal != -1 && thread_group_empty(p)) + !task_detached(p) && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); kill_orphaned_pgrp(p, father); @@ -715,18 +790,18 @@ static void forget_original_parent(struct task_struct *father) } else { /* reparent ptraced task to its real parent */ __ptrace_unlink (p); - if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && + if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); } /* - * if the ptraced child is a zombie with exit_signal == -1 - * we must collect it before we exit, or it will remain - * zombie forever since we prevented it from self-reap itself - * while it was being traced by us, to be able to see it in wait4. + * if the ptraced child is a detached zombie we must collect + * it before we exit, or it will remain zombie forever since + * we prevented it from self-reap itself while it was being + * traced by us, to be able to see it in wait4. */ - if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1)) + if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) list_add(&p->ptrace_list, &ptrace_dead); } @@ -783,29 +858,30 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * we have changed execution domain as these two values started * the same after a fork. */ - if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && + if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) - && !capable(CAP_KILL)) + tsk->self_exec_id != tsk->parent_exec_id) && + !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then * send it a SIGCHLD instead of honoring exit_signal. exit_signal * only has special meaning to our real parent. */ - if (tsk->exit_signal != -1 && thread_group_empty(tsk)) { - int signal = tsk->parent == tsk->real_parent ? tsk->exit_signal : SIGCHLD; + if (!task_detached(tsk) && thread_group_empty(tsk)) { + int signal = ptrace_reparented(tsk) ? + SIGCHLD : tsk->exit_signal; do_notify_parent(tsk, signal); } else if (tsk->ptrace) { do_notify_parent(tsk, SIGCHLD); } state = EXIT_ZOMBIE; - if (tsk->exit_signal == -1 && likely(!tsk->ptrace)) + if (task_detached(tsk) && likely(!tsk->ptrace)) state = EXIT_DEAD; tsk->exit_state = state; + /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && tsk->signal->notify_count < 0 && tsk->signal->group_exit_task) @@ -967,8 +1043,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); exit_sem(tsk); - __exit_files(tsk); - __exit_fs(tsk); + exit_files(tsk); + exit_fs(tsk); check_stack_usage(); exit_thread(); cgroup_exit(tsk, 1); @@ -984,7 +1060,7 @@ NORET_TYPE void do_exit(long code) proc_exit_connector(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA - mpol_free(tsk->mempolicy); + mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif #ifdef CONFIG_FUTEX @@ -1049,12 +1125,13 @@ asmlinkage long sys_exit(int error_code) NORET_TYPE void do_group_exit(int exit_code) { + struct signal_struct *sig = current->signal; + BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) - exit_code = current->signal->group_exit_code; + if (signal_group_exit(sig)) + exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { - struct signal_struct *const sig = current->signal; struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) @@ -1106,7 +1183,7 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options, * Do not consider detached threads that are * not ptraced: */ - if (p->exit_signal == -1 && !p->ptrace) + if (task_detached(p) && !p->ptrace) return 0; /* Wait for all children (clone and not) if __WALL is set; @@ -1196,8 +1273,7 @@ static int wait_task_zombie(struct task_struct *p, int noreap, return 0; } - /* traced means p->ptrace, but not vice versa */ - traced = (p->real_parent != p->parent); + traced = ptrace_reparented(p); if (likely(!traced)) { struct signal_struct *psig; @@ -1298,9 +1374,9 @@ static int wait_task_zombie(struct task_struct *p, int noreap, * If it's still not detached after that, don't release * it now. */ - if (p->exit_signal != -1) { + if (!task_detached(p)) { do_notify_parent(p, p->exit_signal); - if (p->exit_signal != -1) { + if (!task_detached(p)) { p->exit_state = EXIT_ZOMBIE; p = NULL; } @@ -1608,7 +1684,7 @@ asmlinkage long sys_waitid(int which, pid_t upid, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1640,7 +1716,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, put_pid(pid); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index dd249c37b3a..19908b26cf8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -22,6 +22,7 @@ #include <linux/mempolicy.h> #include <linux/sem.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/key.h> #include <linux/binfmts.h> #include <linux/mman.h> @@ -132,6 +133,14 @@ void __put_task_struct(struct task_struct *tsk) free_task(tsk); } +/* + * macro override instead of weak attribute alias, to workaround + * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions. + */ +#ifndef arch_task_cache_init +#define arch_task_cache_init() +#endif + void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR @@ -144,6 +153,9 @@ void __init fork_init(unsigned long mempages) ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); #endif + /* do the arch specific task caches init */ + arch_task_cache_init(); + /* * The default maximum number of threads is set to a safe * value: the thread structures can take up at most half @@ -163,6 +175,13 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } +int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + *dst = *src; + return 0; +} + static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; @@ -181,15 +200,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; } - *tsk = *orig; + err = arch_dup_task_struct(tsk, orig); + if (err) + goto out; + tsk->stack = ti; err = prop_local_init_single(&tsk->dirties); - if (err) { - free_thread_info(ti); - free_task_struct(tsk); - return NULL; - } + if (err) + goto out; setup_thread_stack(tsk, orig); @@ -205,6 +224,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) #endif tsk->splice_pipe = NULL; return tsk; + +out: + free_thread_info(ti); + free_task_struct(tsk); + return NULL; } #ifdef CONFIG_MMU @@ -256,7 +280,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; - pol = mpol_copy(vma_policy(mpnt)); + pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; @@ -358,14 +382,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) mm->ioctx_list = NULL; mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; - mm_init_cgroup(mm, p); + mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; return mm; } - mm_free_cgroup(mm); free_mm(mm); return NULL; } @@ -394,7 +417,6 @@ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); - mm_free_cgroup(mm); destroy_context(mm); free_mm(mm); } @@ -410,6 +432,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); + set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); @@ -498,7 +521,7 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * Allocate a new mm structure and copy contents from the * mm structure of the passed in task structure. */ -static struct mm_struct *dup_mm(struct task_struct *tsk) +struct mm_struct *dup_mm(struct task_struct *tsk) { struct mm_struct *mm, *oldmm = current->mm; int err; @@ -522,6 +545,8 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) if (init_new_context(tsk, mm)) goto fail_nocontext; + dup_mm_exe_file(oldmm, mm); + err = dup_mmap(mm, oldmm); if (err) goto free_pt; @@ -635,136 +660,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int count_open_files(struct fdtable *fdt) -{ - int size = fdt->max_fds; - int i; - - /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) - break; - } - i = (i+1) * 8 * sizeof(long); - return i; -} - -static struct files_struct *alloc_files(void) -{ - struct files_struct *newf; - struct fdtable *fdt; - - newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); - if (!newf) - goto out; - - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - fdt = &newf->fdtab; - fdt->max_fds = NR_OPEN_DEFAULT; - fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - fdt->open_fds = (fd_set *)&newf->open_fds_init; - fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&fdt->rcu); - fdt->next = NULL; - rcu_assign_pointer(newf->fdt, fdt); -out: - return newf; -} - -/* - * Allocate a new files structure and copy contents from the - * passed in files structure. - * errorp will be valid only when the returned files_struct is NULL. - */ -static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) -{ - struct files_struct *newf; - struct file **old_fds, **new_fds; - int open_files, size, i; - struct fdtable *old_fdt, *new_fdt; - - *errorp = -ENOMEM; - newf = alloc_files(); - if (!newf) - goto out; - - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - new_fdt = files_fdtable(newf); - open_files = count_open_files(old_fdt); - - /* - * Check whether we need to allocate a larger fd array and fd set. - * Note: we're not a clone task, so the open count won't change. - */ - if (open_files > new_fdt->max_fds) { - new_fdt->max_fds = 0; - spin_unlock(&oldf->file_lock); - spin_lock(&newf->file_lock); - *errorp = expand_files(newf, open_files-1); - spin_unlock(&newf->file_lock); - if (*errorp < 0) - goto out_release; - new_fdt = files_fdtable(newf); - /* - * Reacquire the oldf lock and a pointer to its fd table - * who knows it may have a new bigger fd table. We need - * the latest pointer. - */ - spin_lock(&oldf->file_lock); - old_fdt = files_fdtable(oldf); - } - - old_fds = old_fdt->fd; - new_fds = new_fdt->fd; - - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); - - for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; - if (f) { - get_file(f); - } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ - FD_CLR(open_files - i, new_fdt->open_fds); - } - rcu_assign_pointer(*new_fds++, f); - } - spin_unlock(&oldf->file_lock); - - /* compute the remainder to be cleared */ - size = (new_fdt->max_fds - open_files) * sizeof(struct file *); - - /* This is long word aligned thus could use a optimized version */ - memset(new_fds, 0, size); - - if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); - - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); - } - - return newf; - -out_release: - kmem_cache_free(files_cachep, newf); -out: - return NULL; -} - static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; @@ -782,12 +677,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) goto out; } - /* - * Note: we may be using current for both targets (See exec.c) - * This works because we cache current->files (old) as oldf. Don't - * break this. - */ - tsk->files = NULL; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -823,34 +712,6 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return 0; } -/* - * Helper to unshare the files of the current task. - * We don't want to expose copy_files internals to - * the exec layer of the kernel. - */ - -int unshare_files(void) -{ - struct files_struct *files = current->files; - int rc; - - BUG_ON(!files); - - /* This can race but the race causes us to copy when we don't - need to and drop the copy */ - if(atomic_read(&files->count) == 1) - { - atomic_inc(&files->count); - return 0; - } - rc = copy_files(0, current); - if(rc) - current->files = files; - return rc; -} - -EXPORT_SYMBOL(unshare_files); - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -902,7 +763,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->group_exit_code = 0; sig->group_exit_task = NULL; sig->group_stop_count = 0; - sig->curr_target = NULL; + sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); @@ -993,6 +854,13 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } +#ifdef CONFIG_MM_OWNER +void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ + mm->owner = p; +} +#endif /* CONFIG_MM_OWNER */ + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1127,7 +995,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->audit_context = NULL; cgroup_fork(p); #ifdef CONFIG_NUMA - p->mempolicy = mpol_copy(p->mempolicy); + p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; @@ -1385,7 +1253,7 @@ bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_policy: #ifdef CONFIG_NUMA - mpol_free(p->mempolicy); + mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); @@ -1675,18 +1543,6 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp } /* - * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not - * supported yet - */ -static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp) -{ - if (unshare_flags & CLONE_SYSVSEM) - return -EINVAL; - - return 0; -} - -/* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* * functions used by do_fork() cannot be used here directly @@ -1701,8 +1557,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct sighand_struct *new_sigh = NULL; struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; - struct sem_undo_list *new_ulist = NULL; struct nsproxy *new_nsproxy = NULL; + int do_sysvsem = 0; check_unshare_flags(&unshare_flags); @@ -1714,6 +1570,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) CLONE_NEWNET)) goto bad_unshare_out; + /* + * CLONE_NEWIPC must also detach from the undolist: after switching + * to a new ipc namespace, the semaphore arrays from the old + * namespace are unreachable. + */ + if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) + do_sysvsem = 1; if ((err = unshare_thread(unshare_flags))) goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) @@ -1724,13 +1587,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_sigh; if ((err = unshare_fd(unshare_flags, &new_fd))) goto bad_unshare_cleanup_vm; - if ((err = unshare_semundo(unshare_flags, &new_ulist))) - goto bad_unshare_cleanup_fd; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) - goto bad_unshare_cleanup_semundo; + goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { + if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (do_sysvsem) { + /* + * CLONE_SYSVSEM is equivalent to sys_exit(). + */ + exit_sem(current); + } if (new_nsproxy) { switch_task_namespaces(current, new_nsproxy); @@ -1766,7 +1633,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_nsproxy) put_nsproxy(new_nsproxy); -bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); @@ -1788,3 +1654,27 @@ bad_unshare_cleanup_thread: bad_unshare_out: return err; } + +/* + * Helper to unshare the files of the current task. + * We don't want to expose copy_files internals to + * the exec layer of the kernel. + */ + +int unshare_files(struct files_struct **displaced) +{ + struct task_struct *task = current; + struct files_struct *copy = NULL; + int error; + + error = unshare_fd(CLONE_FILES, ©); + if (error || !copy) { + *displaced = NULL; + return error; + } + *displaced = task->files; + task_lock(task); + task->files = copy; + task_unlock(task); + return 0; +} diff --git a/kernel/futex.c b/kernel/futex.c index 06968cd7920..7d1136e97c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -104,10 +104,6 @@ struct futex_q { /* Key which the futex is hashed on: */ union futex_key key; - /* For fd, sigio sent using these: */ - int fd; - struct file *filp; - /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; @@ -126,9 +122,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; -/* Futex-fs vfsmount entry: */ -static struct vfsmount *futex_mnt; - /* * Take mm->mmap_sem, when futex is shared */ @@ -281,7 +274,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, */ static void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr == 0) + if (key->both.ptr == NULL) return; switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -610,8 +603,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static void wake_futex(struct futex_q *q) { plist_del(&q->list, &q->list.plist); - if (q->filp) - send_sigio(&q->filp->f_owner, q->fd, POLL_IN); /* * The lock in wake_up_all() is a crucial memory barrier after the * plist_del() and also before assigning to q->lock_ptr. @@ -988,14 +979,10 @@ out: } /* The key must be already stored in q->key. */ -static inline struct futex_hash_bucket * -queue_lock(struct futex_q *q, int fd, struct file *filp) +static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - q->fd = fd; - q->filp = filp; - init_waitqueue_head(&q->waiters); get_futex_key_refs(&q->key); @@ -1006,7 +993,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) return hb; } -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -1041,15 +1028,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * exactly once. They are called with the hashed spinlock held. */ -/* The key must be already stored in q->key. */ -static void queue_me(struct futex_q *q, int fd, struct file *filp) -{ - struct futex_hash_bucket *hb; - - hb = queue_lock(q, fd, filp); - __queue_me(q, hb); -} - /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { @@ -1118,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *newowner, + struct rw_semaphore *fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + + /* + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. + */ +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; + + while (1) { + newval = (uval & FUTEX_OWNER_DIED) | newtid; + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (curval == -EFAULT) + goto handle_fault; + if (curval == uval) + break; + uval = curval; + } + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ if (pi_state->owner != NULL) { spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); spin_unlock_irq(&pi_state->owner->pi_lock); - } else - newtid |= FUTEX_OWNER_DIED; + } pi_state->owner = newowner; @@ -1140,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); spin_unlock_irq(&newowner->pi_lock); + return 0; /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. */ - ret = get_futex_value_locked(&uval, uaddr); +handle_fault: + spin_unlock(q->lock_ptr); - while (!ret) { - newval = (uval & FUTEX_OWNER_DIED) | newtid; + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + spin_lock(q->lock_ptr); - if (curval == -EFAULT) - ret = -EFAULT; - if (curval == uval) - break; - uval = curval; - } - return ret; + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* @@ -1194,7 +1224,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret != 0)) goto out_release_sem; - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); /* * Access the page AFTER the futex is queued. @@ -1238,7 +1268,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1266,11 +1296,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (!abs_time) schedule(); else { - hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); t.timer.expires = *abs_time; - hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); + hrtimer_start(&t.timer, t.timer.expires, + HRTIMER_MODE_ABS); if (!hrtimer_active(&t.timer)) t.task = NULL; @@ -1286,6 +1318,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, /* Flag if a timeout occured */ rem = (t.task == NULL); + + destroy_hrtimer_on_stack(&t.timer); } } __set_current_state(TASK_RUNNING); @@ -1367,7 +1401,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, if (time) { to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); to->timer.expires = *time; } @@ -1381,7 +1416,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_release_sem; retry_unlocked: - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); retry_locked: ret = lock_taken = 0; @@ -1494,7 +1529,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* * Only actually queue now that the atomic ops are done: */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1524,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released @@ -1556,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, int res; owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); /* propagate -EFAULT, if the fixup failed */ if (res) @@ -1581,6 +1617,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, unqueue_me_pi(&q); futex_unlock_mm(fshared); + if (to) + destroy_hrtimer_on_stack(&to->timer); return ret != -EINTR ? ret : -ERESTARTNOINTR; out_unlock_release_sem: @@ -1588,6 +1626,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: futex_unlock_mm(fshared); + if (to) + destroy_hrtimer_on_stack(&to->timer); return ret; uaddr_faulted: @@ -1615,6 +1655,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, if (!ret && (uval != -EFAULT)) goto retry; + if (to) + destroy_hrtimer_on_stack(&to->timer); return ret; } @@ -1735,121 +1777,6 @@ pi_faulted: return ret; } -static int futex_close(struct inode *inode, struct file *filp) -{ - struct futex_q *q = filp->private_data; - - unqueue_me(q); - kfree(q); - - return 0; -} - -/* This is one-shot: once it's gone off you need a new fd */ -static unsigned int futex_poll(struct file *filp, - struct poll_table_struct *wait) -{ - struct futex_q *q = filp->private_data; - int ret = 0; - - poll_wait(filp, &q->waiters, wait); - - /* - * plist_node_empty() is safe here without any lock. - * q->lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (plist_node_empty(&q->list)) - ret = POLLIN | POLLRDNORM; - - return ret; -} - -static const struct file_operations futex_fops = { - .release = futex_close, - .poll = futex_poll, -}; - -/* - * Signal allows caller to avoid the race which would occur if they - * set the sigio stuff up afterwards. - */ -static int futex_fd(u32 __user *uaddr, int signal) -{ - struct futex_q *q; - struct file *filp; - int ret, err; - struct rw_semaphore *fshared; - static unsigned long printk_interval; - - if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { - printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " - "will be removed from the kernel in June 2007\n", - current->comm); - } - - ret = -EINVAL; - if (!valid_signal(signal)) - goto out; - - ret = get_unused_fd(); - if (ret < 0) - goto out; - filp = get_empty_filp(); - if (!filp) { - put_unused_fd(ret); - ret = -ENFILE; - goto out; - } - filp->f_op = &futex_fops; - filp->f_path.mnt = mntget(futex_mnt); - filp->f_path.dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; - - if (signal) { - err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); - if (err < 0) { - goto error; - } - filp->f_owner.signum = signal; - } - - q = kmalloc(sizeof(*q), GFP_KERNEL); - if (!q) { - err = -ENOMEM; - goto error; - } - q->pi_state = NULL; - - fshared = ¤t->mm->mmap_sem; - down_read(fshared); - err = get_futex_key(uaddr, fshared, &q->key); - - if (unlikely(err != 0)) { - up_read(fshared); - kfree(q); - goto error; - } - - /* - * queue_me() must be called before releasing mmap_sem, because - * key->shared.inode needs to be referenced while holding it. - */ - filp->private_data = q; - - queue_me(q, ret, filp); - up_read(fshared); - - /* Now we map fd to filp, so userspace can access it */ - fd_install(ret, filp); -out: - return ret; -error: - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; -} - /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -2081,10 +2008,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_BITSET: ret = futex_wake(uaddr, fshared, val, val3); break; - case FUTEX_FD: - /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ - ret = futex_fd(uaddr, val); - break; case FUTEX_REQUEUE: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); break; @@ -2145,20 +2068,7 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt); -} - -static struct file_system_type futex_fs_type = { - .name = "futexfs", - .get_sb = futexfs_get_sb, - .kill_sb = kill_anon_super, -}; - -static int __init init(void) +static int __init futex_init(void) { u32 curval; int i; @@ -2182,16 +2092,6 @@ static int __init init(void) spin_lock_init(&futex_queues[i].lock); } - i = register_filesystem(&futex_fs_type); - if (i) - return i; - - futex_mnt = kern_mount(&futex_fs_type); - if (IS_ERR(futex_mnt)) { - unregister_filesystem(&futex_fs_type); - return PTR_ERR(futex_mnt); - } - return 0; } -__initcall(init); +__initcall(futex_init); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index ff90f049f8f..04ac3a9e42c 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, return 0; } -static void __user *futex_uaddr(struct robust_list *entry, +static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 98bee013f71..543d9ca9b4f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -43,6 +43,7 @@ #include <linux/tick.h> #include <linux/seq_file.h> #include <linux/err.h> +#include <linux/debugobjects.h> #include <asm/uaccess.h> @@ -153,15 +154,6 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) } /* - * Helper function to check, whether the timer is running the callback - * function - */ -static inline int hrtimer_callback_running(struct hrtimer *timer) -{ - return timer->state & HRTIMER_STATE_CALLBACK; -} - -/* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ @@ -342,6 +334,115 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) return res; } +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr hrtimer_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_init(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +{ + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_free(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr hrtimer_debug_descr = { + .name = "hrtimer", + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, +}; + +static inline void debug_hrtimer_init(struct hrtimer *timer) +{ + debug_object_init(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_activate(struct hrtimer *timer) +{ + debug_object_activate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) +{ + debug_object_deactivate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_free(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode); + +void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_object_init_on_stack(timer, &hrtimer_debug_descr); + __hrtimer_init(timer, clock_id, mode); +} + +void destroy_hrtimer_on_stack(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +#else +static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +#endif + /* * Check, whether the timer is on the callback pending list */ @@ -567,6 +668,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, /* Timer is expired, act upon the callback mode */ switch(timer->cb_mode) { case HRTIMER_CB_IRQSAFE_NO_RESTART: + debug_hrtimer_deactivate(timer); /* * We can call the callback from here. No restart * happens, so no danger of recursion @@ -581,6 +683,7 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, * the tick timer in the softirq ! The calling site * takes care of this. */ + debug_hrtimer_deactivate(timer); return 1; case HRTIMER_CB_IRQSAFE: case HRTIMER_CB_SOFTIRQ: @@ -590,7 +693,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, list_add_tail(&timer->cb_entry, &base->cpu_base->cb_pending); timer->state = HRTIMER_STATE_PENDING; - raise_softirq(HRTIMER_SOFTIRQ); return 1; default: BUG(); @@ -633,6 +735,11 @@ static int hrtimer_switch_to_hres(void) return 1; } +static inline void hrtimer_raise_softirq(void) +{ + raise_softirq(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -651,6 +758,7 @@ static inline int hrtimer_reprogram(struct hrtimer *timer, { return 0; } +static inline void hrtimer_raise_softirq(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -730,6 +838,8 @@ static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer *entry; int leftmost = 1; + debug_hrtimer_activate(timer); + /* * Find the right place in the rbtree: */ @@ -826,6 +936,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ + debug_hrtimer_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, @@ -850,7 +961,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret; + int ret, raise; base = lock_hrtimer_base(timer, &flags); @@ -873,6 +984,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) tim = ktime_add_safe(tim, base->resolution); #endif } + timer->expires = tim; timer_stats_hrtimer_set_start_info(timer); @@ -884,8 +996,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) enqueue_hrtimer(timer, new_base, new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); + /* + * The timer may be expired and moved to the cb_pending + * list. We can not raise the softirq with base lock held due + * to a possible deadlock with runqueue lock. + */ + raise = timer->state == HRTIMER_STATE_PENDING; + unlock_hrtimer_base(timer, &flags); + if (raise) + hrtimer_raise_softirq(); + return ret; } EXPORT_SYMBOL_GPL(hrtimer_start); @@ -956,7 +1078,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) +#ifdef CONFIG_NO_HZ /** * hrtimer_get_next_event - get the time until next expiry event * @@ -996,14 +1118,8 @@ ktime_t hrtimer_get_next_event(void) } #endif -/** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: timer mode abs/rel - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) { struct hrtimer_cpu_base *cpu_base; @@ -1024,6 +1140,19 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer->start_comm, 0, TASK_COMM_LEN); #endif } + +/** + * hrtimer_init - initialize a timer to the given clock + * @timer: the timer to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + */ +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + __hrtimer_init(timer, clock_id, mode); +} EXPORT_SYMBOL_GPL(hrtimer_init); /** @@ -1057,6 +1186,7 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) timer = list_entry(cpu_base->cb_pending.next, struct hrtimer, cb_entry); + debug_hrtimer_deactivate(timer); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1080,8 +1210,19 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) * If the timer was rearmed on another CPU, reprogram * the event device. */ - if (timer->base->first == &timer->node) - hrtimer_reprogram(timer, timer->base); + struct hrtimer_clock_base *base = timer->base; + + if (base->first == &timer->node && + hrtimer_reprogram(timer, base)) { + /* + * Timer is expired. Thus move it from tree to + * pending list again. + */ + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + } } } spin_unlock_irq(&cpu_base->lock); @@ -1094,6 +1235,7 @@ static void __run_hrtimer(struct hrtimer *timer) enum hrtimer_restart (*fn)(struct hrtimer *); int restart; + debug_hrtimer_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); @@ -1238,51 +1380,50 @@ void hrtimer_run_pending(void) /* * Called from hardirq context every jiffy */ -static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, - int index) +void hrtimer_run_queues(void) { struct rb_node *node; - struct hrtimer_clock_base *base = &cpu_base->clock_base[index]; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + int index, gettime = 1; - if (!base->first) + if (hrtimer_hres_active()) return; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - - spin_lock(&cpu_base->lock); - - while ((node = base->first)) { - struct hrtimer *timer; - - timer = rb_entry(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= timer->expires.tv64) - break; + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; - if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { - __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); - list_add_tail(&timer->cb_entry, - &base->cpu_base->cb_pending); + if (!base->first) continue; + + if (base->get_softirq_time) + base->softirq_time = base->get_softirq_time(); + else if (gettime) { + hrtimer_get_softirq_time(cpu_base); + gettime = 0; } - __run_hrtimer(timer); - } - spin_unlock(&cpu_base->lock); -} + spin_lock(&cpu_base->lock); -void hrtimer_run_queues(void) -{ - struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - int i; + while ((node = base->first)) { + struct hrtimer *timer; - if (hrtimer_hres_active()) - return; + timer = rb_entry(node, struct hrtimer, node); + if (base->softirq_time.tv64 <= timer->expires.tv64) + break; - hrtimer_get_softirq_time(cpu_base); + if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { + __remove_hrtimer(timer, base, + HRTIMER_STATE_PENDING, 0); + list_add_tail(&timer->cb_entry, + &base->cpu_base->cb_pending); + continue; + } - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - run_hrtimer_queue(cpu_base, i); + __run_hrtimer(timer); + } + spin_unlock(&cpu_base->lock); + } } /* @@ -1353,22 +1494,27 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; struct timespec __user *rmtp; + int ret = 0; - hrtimer_init(&t.timer, restart->arg0, HRTIMER_MODE_ABS); - t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2; + hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, + HRTIMER_MODE_ABS); + t.timer.expires.tv64 = restart->nanosleep.expires; if (do_nanosleep(&t, HRTIMER_MODE_ABS)) - return 0; + goto out; - rmtp = (struct timespec __user *)restart->arg1; + rmtp = restart->nanosleep.rmtp; if (rmtp) { - int ret = update_rmtp(&t.timer, rmtp); + ret = update_rmtp(&t.timer, rmtp); if (ret <= 0) - return ret; + goto out; } /* The other values in restart are already filled in */ - return -ERESTART_RESTARTBLOCK; + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; } long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, @@ -1376,30 +1522,35 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, { struct restart_block *restart; struct hrtimer_sleeper t; + int ret = 0; - hrtimer_init(&t.timer, clockid, mode); + hrtimer_init_on_stack(&t.timer, clockid, mode); t.timer.expires = timespec_to_ktime(*rqtp); if (do_nanosleep(&t, mode)) - return 0; + goto out; /* Absolute timers do not update the rmtp value and restart: */ - if (mode == HRTIMER_MODE_ABS) - return -ERESTARTNOHAND; + if (mode == HRTIMER_MODE_ABS) { + ret = -ERESTARTNOHAND; + goto out; + } if (rmtp) { - int ret = update_rmtp(&t.timer, rmtp); + ret = update_rmtp(&t.timer, rmtp); if (ret <= 0) - return ret; + goto out; } restart = ¤t_thread_info()->restart_block; restart->fn = hrtimer_nanosleep_restart; - restart->arg0 = (unsigned long) t.timer.base->index; - restart->arg1 = (unsigned long) rmtp; - restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF; - restart->arg3 = t.timer.expires.tv64 >> 32; + restart->nanosleep.index = t.timer.base->index; + restart->nanosleep.rmtp = rmtp; + restart->nanosleep.expires = t.timer.expires.tv64; - return -ERESTART_RESTARTBLOCK; + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; } asmlinkage long @@ -1425,7 +1576,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) int i; spin_lock_init(&cpu_base->lock); - lockdep_set_class(&cpu_base->lock, &cpu_base->lock_key); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) cpu_base->clock_base[i].cpu_base = cpu_base; @@ -1445,6 +1595,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); + debug_hrtimer_deactivate(timer); __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); timer->base = new_base; /* @@ -1466,16 +1617,16 @@ static void migrate_hrtimers(int cpu) tick_cancel_sched_timer(cpu); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_lock(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } - double_spin_unlock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fdb3fbe2b0c..964964baefa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -47,7 +47,7 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - desc->affinity = CPU_MASK_ALL; + cpus_setall(desc->affinity); #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 6d9204f3a37..38a25b8d8bf 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -1,6 +1,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/device.h> +#include <linux/gfp.h> /* * Device resource management aware IRQ request/free implementation. diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 438a0146428..46d6611a33b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -11,6 +11,7 @@ #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> +#include <linux/slab.h> #include "internals.h" @@ -149,6 +150,26 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); +static void __enable_irq(struct irq_desc *desc, unsigned int irq) +{ + switch (desc->depth) { + case 0: + printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); + WARN_ON(1); + break; + case 1: { + unsigned int status = desc->status & ~IRQ_DISABLED; + + /* Prevent probing on this irq: */ + desc->status = status | IRQ_NOPROBE; + check_irq_resend(desc, irq); + /* fall-through */ + } + default: + desc->depth--; + } +} + /** * enable_irq - enable handling of an irq * @irq: Interrupt to enable @@ -168,22 +189,7 @@ void enable_irq(unsigned int irq) return; spin_lock_irqsave(&desc->lock, flags); - switch (desc->depth) { - case 0: - printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); - WARN_ON(1); - break; - case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - - /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; - check_irq_resend(desc, irq); - /* fall-through */ - } - default: - desc->depth--; - } + __enable_irq(desc, irq); spin_unlock_irqrestore(&desc->lock, flags); } EXPORT_SYMBOL(enable_irq); @@ -364,7 +370,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) compat_irq_chip_set_default_handler(desc); desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | - IRQ_INPROGRESS); + IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; @@ -380,6 +386,16 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; + + /* + * Check whether we disabled the irq via the spurious handler + * before. Reenable it and give it another chance. + */ + if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { + desc->status &= ~IRQ_SPURIOUS_DISABLED; + __enable_irq(desc, irq); + } + spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 088dabbf2d6..c66d3f10e85 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -209,8 +209,8 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED; - desc->depth = 1; + desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; + desc->depth++; desc->chip->disable(irq); } desc->irqs_unhandled = 0; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f091d13def0..6fc0040f3e3 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -472,11 +472,7 @@ static const struct file_operations kallsyms_operations = { static int __init kallsyms_init(void) { - struct proc_dir_entry *entry; - - entry = create_proc_entry("kallsyms", 0444, NULL); - if (entry) - entry->proc_fops = &kallsyms_operations; + proc_create("kallsyms", 0444, NULL, &kallsyms_operations); return 0; } __initcall(kallsyms_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index 06a0e277565..1c5fcacbcf3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -29,7 +29,6 @@ #include <asm/uaccess.h> #include <asm/io.h> #include <asm/system.h> -#include <asm/semaphore.h> #include <asm/sections.h> /* Per cpu memory for storing cpu states in case of system crash. */ @@ -1218,7 +1217,7 @@ static int __init parse_crashkernel_mem(char *cmdline, } /* match ? */ - if (system_ram >= start && system_ram <= end) { + if (system_ram >= start && system_ram < end) { *crash_size = size; break; } @@ -1406,6 +1405,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); arch_crash_save_vmcoreinfo(); diff --git a/kernel/kgdb.c b/kernel/kgdb.c new file mode 100644 index 00000000000..3ec23c3ec97 --- /dev/null +++ b/kernel/kgdb.c @@ -0,0 +1,1689 @@ +/* + * KGDB stub. + * + * Maintainer: Jason Wessel <jason.wessel@windriver.com> + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2008 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger <george@mvista.com> + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe <dave@gcom.com>, + * Tigran Aivazian <tigran@sco.com> + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#include <linux/pid_namespace.h> +#include <linux/clocksource.h> +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/console.h> +#include <linux/threads.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/reboot.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/sysrq.h> +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/pid.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/byteorder.h> +#include <asm/atomic.h> +#include <asm/system.h> +#include <asm/unaligned.h> + +static int kgdb_break_asap; + +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + unsigned long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +static struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; +} kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +static int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +static struct kgdb_io *kgdb_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t passive_cpu_wait[NR_CPUS]; +static atomic_t cpu_in_kgdb[NR_CPUS]; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. */ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +static int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overriden by architectures when needed: + */ +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + + return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) +{ + int err; + + err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE); + if (err) + return err; + + return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); +} + +int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) +{ + return probe_kernel_write((char *)addr, + (char *)bundle, BREAK_INSTR_SIZE); +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +void __weak +kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) +{ + return; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +void __weak kgdb_disable_hw_debug(struct pt_regs *regs) +{ +} + +/* + * GDB remote protocol parser: + */ + +static int hex(char ch) +{ + if ((ch >= 'a') && (ch <= 'f')) + return ch - 'a' + 10; + if ((ch >= '0') && (ch <= '9')) + return ch - '0'; + if ((ch >= 'A') && (ch <= 'F')) + return ch - 'A' + 10; + return -1; +} + +/* scan for the sequence $<data>#<checksum> */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (kgdb_io_ops->read_char())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = kgdb_io_ops->read_char(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex(kgdb_io_ops->read_char()) << 4; + xmitcsum += hex(kgdb_io_ops->read_char()); + + if (checksum != xmitcsum) + /* failed checksum */ + kgdb_io_ops->write_char('-'); + else + /* successful transfer */ + kgdb_io_ops->write_char('+'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $<packet info>#<checksum>. + */ + while (1) { + kgdb_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + kgdb_io_ops->write_char(ch); + checksum += ch; + count++; + } + + kgdb_io_ops->write_char('#'); + kgdb_io_ops->write_char(hex_asc_hi(checksum)); + kgdb_io_ops->write_char(hex_asc_lo(checksum)); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = kgdb_io_ops->read_char(); + + if (ch == 3) + ch = kgdb_io_ops->read_char(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + kgdb_io_ops->write_char('-'); + if (kgdb_io_ops->flush) + kgdb_io_ops->flush(); + return; + } + } +} + +/* + * Convert the memory pointed to by mem into hex, placing result in buf. + * Return a pointer to the last char put in buf (null). May return an error. + */ +int kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. + */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (!err) { + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + + *buf = 0; + } + + return err; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return a pointer to the character after + * the last byte written. + */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int err = 0; + char c; + + while (count-- > 0) { + c = *buf++; + if (c == 0x7d) + c = *buf++ ^ 0x20; + + err = probe_kernel_write(mem, &c, 1); + if (err) + break; + + mem++; + } + + return err; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in mem. + * Return a pointer to the character AFTER the last byte written. + * May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex(*tmp_hex--); + *tmp_raw |= hex(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, unsigned long *long_val) +{ + int hex_val; + int num = 0; + + *long_val = 0; + + while (**ptr) { + hex_val = hex(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + return num; +} + +/* Write memory due to an 'M' or 'X' packet. */ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length + 1); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. + */ + +#define BUF_THREAD_ID_SIZE 16 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + char *limit; + + limit = pkt + BUF_THREAD_ID_SIZE; + while (pkt < limit) + pkt = pack_hex_byte(pkt, *id++); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + unsigned char *scan; + int i = 4; + + scan = (unsigned char *)id; + while (i--) + *scan++ = 0; + put_unaligned_be32(value, scan); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped idle tasks: + */ + if (tid <= 0) + return idle_task(-tid); + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + +/* + * CPU debug state control: + */ + +#ifdef CONFIG_SMP +static void kgdb_wait(struct pt_regs *regs) +{ + unsigned long flags; + int cpu; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + /* + * Make sure the above info reaches the primary CPU before + * our cpu_in_kgdb[] flag setting does: + */ + smp_wmb(); + atomic_set(&cpu_in_kgdb[cpu], 1); + + /* Wait till primary CPU is done with debugging */ + while (atomic_read(&passive_cpu_wait[cpu])) + cpu_relax(); + + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; + + /* fix up hardware debug registers on local cpu */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + + /* Signal the primary CPU that we are done: */ + atomic_set(&cpu_in_kgdb[cpu], 0); + clocksource_touch_watchdog(); + local_irq_restore(flags); +} +#endif + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm && current->mm->mmap_cache) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); +} + +/* + * SW breakpoint management: + */ +static int kgdb_activate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_set_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_ACTIVE; + } + return 0; +} + +static int kgdb_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_UNDEFINED) { + breakno = i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +static int kgdb_deactivate_sw_breakpoints(void) +{ + unsigned long addr; + int error = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + return error; + + kgdb_flush_swbreak_addr(addr); + kgdb_break[i].state = BP_SET; + } + return 0; +} + +static int kgdb_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +static int remove_all_break(void) +{ + unsigned long addr; + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; + addr = kgdb_break[i].bpt_addr; + error = kgdb_arch_remove_breakpoint(addr, + kgdb_break[i].saved_instr); + if (error) + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -1-raw_smp_processor_id(); +} + +static char gdbmsgbuf[BUFMAX + 1]; + +static void kgdb_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Return true if there is a valid kgdb I/O module. Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. + * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!kgdb_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); + return 1; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. + */ + remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for (i = 0; i < NR_CPUS; i++) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in __schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. + */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (err) + error_packet(remcom_out_buffer, err); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct task_struct *thread; + unsigned char thref[8]; + char *ptr; + int i; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + + if (remcom_in_buffer[1] == 'f') + ks->threadid = 1; + + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + + for (i = 0; i < 17; ks->threadid++) { + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) { + int_to_threadref(thref, ks->threadid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + i++; + } + } + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if (ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "Shadow task %d for pid 0", + (int)(-ks->threadid-1)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. */ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = kgdb_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = kgdb_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + error_packet(remcom_out_buffer, -EINVAL); + return 0; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +static int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + if (kgdb_connected) { + unsigned char thref[8]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. + */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. + */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kgdb_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. + */ + if (kgdb_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + kgdb_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); + WARN_ON_ONCE(1); + + return 1; + } + remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + unsigned long flags; + int error = 0; + int i, cpu; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->ex_vector = evector; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! */ + +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. + */ + local_irq_save(flags); + + cpu = raw_smp_processor_id(); + + /* + * Acquire the kgdb_active lock: + */ + while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1) + cpu_relax(); + + /* + * Do not start the debugger connection on this CPU if the last + * instance of the exception handler wanted to come into the + * debugger on a different CPU via a single step + */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + atomic_read(&kgdb_cpu_doing_single_step) != cpu) { + + atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + error = 1; + goto kgdb_restore; /* No I/O connection, so resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. + */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (kgdb_io_ops->pre_exception) + kgdb_io_ops->pre_exception(); + + kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs; + kgdb_info[ks->cpu].task = current; + + kgdb_disable_hw_debug(ks->linux_regs); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step || !kgdb_contthread) { + for (i = 0; i < NR_CPUS; i++) + atomic_set(&passive_cpu_wait[i], 1); + } + + /* + * spin_lock code is good enough as a barrier so we don't + * need one here: + */ + atomic_set(&cpu_in_kgdb[ks->cpu], 1); + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + for_each_online_cpu(i) { + while (!atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); + kgdb_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = NULL; + exception_level = 0; + + /* Talk to debugger with gdbserial protocol */ + error = gdb_serial_stub(ks); + + /* Call the I/O driver's post_exception routine */ + if (kgdb_io_ops->post_exception) + kgdb_io_ops->post_exception(); + + kgdb_info[ks->cpu].debuggerinfo = NULL; + kgdb_info[ks->cpu].task = NULL; + atomic_set(&cpu_in_kgdb[ks->cpu], 0); + + if (!kgdb_single_step || !kgdb_contthread) { + for (i = NR_CPUS-1; i >= 0; i--) + atomic_set(&passive_cpu_wait[i], 0); + /* + * Wait till all the CPUs have quit + * from the debugger. + */ + for_each_online_cpu(i) { + while (atomic_read(&cpu_in_kgdb[i])) + cpu_relax(); + } + } + +kgdb_restore: + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + clocksource_touch_watchdog(); + local_irq_restore(flags); + + return error; +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + if (!atomic_read(&cpu_in_kgdb[cpu]) && + atomic_read(&kgdb_active) != cpu && + atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) { + kgdb_wait((struct pt_regs *)regs); + return 0; + } +#endif + return 1; +} + +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. */ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1) + return; + + local_irq_save(flags); + kgdb_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_gdb(int key, struct tty_struct *tty) +{ + if (!kgdb_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) + printk(KERN_CRIT "Entering KGDB\n"); + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_gdb_op = { + .handler = sysrq_handle_gdb, + .help_msg = "Gdb", + .action_msg = "GDB", +}; +#endif + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_gdb_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kgdb_register_io_module - register KGDB IO module + * @new_kgdb_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (kgdb_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_kgdb_io_ops->init) { + err = new_kgdb_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + kgdb_io_ops = new_kgdb_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_kgdb_io_ops->name); + + /* Arm KGDB now. */ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kkgdb_unregister_io_module - unregister KGDB IO module + * @old_kgdb_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. + */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops); + kgdb_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_kgdb_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_set(&kgdb_setting_breakpoint, 1); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_set(&kgdb_setting_breakpoint, 0); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/kmod.c b/kernel/kmod.c index 22be3ff3f36..8df97d3dfda 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -27,6 +27,7 @@ #include <linux/mnt_namespace.h> #include <linux/completion.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> @@ -165,7 +166,7 @@ static int ____call_usermodehelper(void *data) } /* We can run anywhere, unlike our parent keventd(). */ - set_cpus_allowed(current, CPU_MASK_ALL); + set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); /* * Our parent is keventd, which runs with elevated scheduling priority. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fcfb580c3af..d4998f81e22 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,6 +72,18 @@ DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +/* + * Normally, functions that we'd want to prohibit kprobes in, are marked + * __kprobes. But, there are cases where such functions already belong to + * a different section (__sched for preempt_schedule) + * + * For such cases, we now have a blacklist + */ +struct kprobe_blackpoint kprobe_blacklist[] = { + {"preempt_schedule",}, + {NULL} /* Terminator */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -417,6 +429,21 @@ static inline void free_rp_inst(struct kretprobe *rp) } } +static void __kprobes cleanup_rp_inst(struct kretprobe *rp) +{ + unsigned long flags; + struct kretprobe_instance *ri; + struct hlist_node *pos, *next; + /* No race here */ + spin_lock_irqsave(&kretprobe_lock, flags); + hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { + ri->rp = NULL; + hlist_del(&ri->uflist); + } + spin_unlock_irqrestore(&kretprobe_lock, flags); + free_rp_inst(rp); +} + /* * Keep all fields in the kprobe consistent */ @@ -492,9 +519,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, static int __kprobes in_kprobes_functions(unsigned long addr) { + struct kprobe_blackpoint *kb; + if (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) return -EINVAL; + /* + * If there exists a kprobe_blacklist, verify and + * fail any probe registration in the prohibited area + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + if (kb->start_addr) { + if (addr >= kb->start_addr && + addr < (kb->start_addr + kb->range)) + return -EINVAL; + } + } return 0; } @@ -555,6 +595,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, } p->nmissed = 0; + INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -581,35 +622,28 @@ out: return ret; } -int __kprobes register_kprobe(struct kprobe *p) -{ - return __register_kprobe(p, (unsigned long)__builtin_return_address(0)); -} - -void __kprobes unregister_kprobe(struct kprobe *p) +/* + * Unregister a kprobe without a scheduler synchronization. + */ +static int __kprobes __unregister_kprobe_top(struct kprobe *p) { - struct module *mod; struct kprobe *old_p, *list_p; - int cleanup_p; - mutex_lock(&kprobe_mutex); old_p = get_kprobe(p->addr); - if (unlikely(!old_p)) { - mutex_unlock(&kprobe_mutex); - return; - } + if (unlikely(!old_p)) + return -EINVAL; + if (p != old_p) { list_for_each_entry_rcu(list_p, &old_p->list, list) if (list_p == p) /* kprobe p is a valid probe */ goto valid_p; - mutex_unlock(&kprobe_mutex); - return; + return -EINVAL; } valid_p: if (old_p == p || (old_p->pre_handler == aggr_pre_handler && - p->list.next == &old_p->list && p->list.prev == &old_p->list)) { + list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are * enabled - otherwise, the breakpoint would already have @@ -618,43 +652,98 @@ valid_p: if (kprobe_enabled) arch_disarm_kprobe(p); hlist_del_rcu(&old_p->hlist); - cleanup_p = 1; } else { + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler) { + list_for_each_entry_rcu(list_p, &old_p->list, list) { + if ((list_p != p) && (list_p->post_handler)) + goto noclean; + } + old_p->post_handler = NULL; + } +noclean: list_del_rcu(&p->list); - cleanup_p = 0; } + return 0; +} - mutex_unlock(&kprobe_mutex); +static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) +{ + struct module *mod; + struct kprobe *old_p; - synchronize_sched(); if (p->mod_refcounted) { mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); } - if (cleanup_p) { - if (p != old_p) { - list_del_rcu(&p->list); + if (list_empty(&p->list) || list_is_singular(&p->list)) { + if (!list_empty(&p->list)) { + /* "p" is the last child of an aggr_kprobe */ + old_p = list_entry(p->list.next, struct kprobe, list); + list_del(&p->list); kfree(old_p); } arch_remove_kprobe(p); - } else { - mutex_lock(&kprobe_mutex); - if (p->break_handler) - old_p->break_handler = NULL; - if (p->post_handler){ - list_for_each_entry_rcu(list_p, &old_p->list, list){ - if (list_p->post_handler){ - cleanup_p = 2; - break; - } - } - if (cleanup_p == 0) - old_p->post_handler = NULL; + } +} + +static int __register_kprobes(struct kprobe **kps, int num, + unsigned long called_from) +{ + int i, ret = 0; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kprobe(kps[i], called_from); + if (ret < 0) { + if (i > 0) + unregister_kprobes(kps, i); + break; } - mutex_unlock(&kprobe_mutex); } + return ret; +} + +/* + * Registration and unregistration functions for kprobe. + */ +int __kprobes register_kprobe(struct kprobe *p) +{ + return __register_kprobes(&p, 1, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobe(struct kprobe *p) +{ + unregister_kprobes(&p, 1); +} + +int __kprobes register_kprobes(struct kprobe **kps, int num) +{ + return __register_kprobes(kps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kprobes(struct kprobe **kps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(kps[i]) < 0) + kps[i]->addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) + if (kps[i]->addr) + __unregister_kprobe_bottom(kps[i]); } static struct notifier_block kprobe_exceptions_nb = { @@ -667,24 +756,70 @@ unsigned long __weak arch_deref_entry_point(void *entry) return (unsigned long)entry; } -int __kprobes register_jprobe(struct jprobe *jp) +static int __register_jprobes(struct jprobe **jps, int num, + unsigned long called_from) { - unsigned long addr = arch_deref_entry_point(jp->entry); + struct jprobe *jp; + int ret = 0, i; - if (!kernel_text_address(addr)) + if (num <= 0) return -EINVAL; + for (i = 0; i < num; i++) { + unsigned long addr; + jp = jps[i]; + addr = arch_deref_entry_point(jp->entry); + + if (!kernel_text_address(addr)) + ret = -EINVAL; + else { + /* Todo: Verify probepoint is a function entry point */ + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + ret = __register_kprobe(&jp->kp, called_from); + } + if (ret < 0) { + if (i > 0) + unregister_jprobes(jps, i); + break; + } + } + return ret; +} - /* Todo: Verify probepoint is a function entry point */ - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - - return __register_kprobe(&jp->kp, +int __kprobes register_jprobe(struct jprobe *jp) +{ + return __register_jprobes(&jp, 1, (unsigned long)__builtin_return_address(0)); } void __kprobes unregister_jprobe(struct jprobe *jp) { - unregister_kprobe(&jp->kp); + unregister_jprobes(&jp, 1); +} + +int __kprobes register_jprobes(struct jprobe **jps, int num) +{ + return __register_jprobes(jps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_jprobes(struct jprobe **jps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&jps[i]->kp) < 0) + jps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) { + if (jps[i]->kp.addr) + __unregister_kprobe_bottom(&jps[i]->kp); + } } #ifdef CONFIG_KRETPROBES @@ -725,7 +860,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, return 0; } -int __kprobes register_kretprobe(struct kretprobe *rp) +static int __kprobes __register_kretprobe(struct kretprobe *rp, + unsigned long called_from) { int ret = 0; struct kretprobe_instance *inst; @@ -771,46 +907,102 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->nmissed = 0; /* Establish function entry probe point */ - if ((ret = __register_kprobe(&rp->kp, - (unsigned long)__builtin_return_address(0))) != 0) + ret = __register_kprobe(&rp->kp, called_from); + if (ret != 0) free_rp_inst(rp); return ret; } +static int __register_kretprobes(struct kretprobe **rps, int num, + unsigned long called_from) +{ + int ret = 0, i; + + if (num <= 0) + return -EINVAL; + for (i = 0; i < num; i++) { + ret = __register_kretprobe(rps[i], called_from); + if (ret < 0) { + if (i > 0) + unregister_kretprobes(rps, i); + break; + } + } + return ret; +} + +int __kprobes register_kretprobe(struct kretprobe *rp) +{ + return __register_kretprobes(&rp, 1, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kretprobe(struct kretprobe *rp) +{ + unregister_kretprobes(&rp, 1); +} + +int __kprobes register_kretprobes(struct kretprobe **rps, int num) +{ + return __register_kretprobes(rps, num, + (unsigned long)__builtin_return_address(0)); +} + +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +{ + int i; + + if (num <= 0) + return; + mutex_lock(&kprobe_mutex); + for (i = 0; i < num; i++) + if (__unregister_kprobe_top(&rps[i]->kp) < 0) + rps[i]->kp.addr = NULL; + mutex_unlock(&kprobe_mutex); + + synchronize_sched(); + for (i = 0; i < num; i++) { + if (rps[i]->kp.addr) { + __unregister_kprobe_bottom(&rps[i]->kp); + cleanup_rp_inst(rps[i]); + } + } +} + #else /* CONFIG_KRETPROBES */ int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } -static int __kprobes pre_handler_kretprobe(struct kprobe *p, - struct pt_regs *regs) +int __kprobes register_kretprobes(struct kretprobe **rps, int num) { - return 0; + return -ENOSYS; } -#endif /* CONFIG_KRETPROBES */ - void __kprobes unregister_kretprobe(struct kretprobe *rp) { - unsigned long flags; - struct kretprobe_instance *ri; - struct hlist_node *pos, *next; +} - unregister_kprobe(&rp->kp); +void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) +{ +} - /* No race here */ - spin_lock_irqsave(&kretprobe_lock, flags); - hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { - ri->rp = NULL; - hlist_del(&ri->uflist); - } - spin_unlock_irqrestore(&kretprobe_lock, flags); - free_rp_inst(rp); +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) +{ + return 0; } +#endif /* CONFIG_KRETPROBES */ + static int __init init_kprobes(void) { int i, err = 0; + unsigned long offset = 0, size = 0; + char *modname, namebuf[128]; + const char *symbol_name; + void *addr; + struct kprobe_blackpoint *kb; /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ @@ -819,6 +1011,28 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + /* + * Lookup and populate the kprobe_blacklist. + * + * Unlike the kretprobe blacklist, we'll need to determine + * the range of addresses that belong to the said functions, + * since a kprobe need not necessarily be at the beginning + * of a function. + */ + for (kb = kprobe_blacklist; kb->name != NULL; kb++) { + kprobe_lookup_name(kb->name, addr); + if (!addr) + continue; + + kb->start_addr = (unsigned long)addr; + symbol_name = kallsyms_lookup(kb->start_addr, + &size, &offset, &modname, namebuf); + if (!symbol_name) + kb->range = 0; + else + kb->range = size; + } + if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -1066,8 +1280,12 @@ module_init(init_kprobes); EXPORT_SYMBOL_GPL(register_kprobe); EXPORT_SYMBOL_GPL(unregister_kprobe); +EXPORT_SYMBOL_GPL(register_kprobes); +EXPORT_SYMBOL_GPL(unregister_kprobes); EXPORT_SYMBOL_GPL(register_jprobe); EXPORT_SYMBOL_GPL(unregister_jprobe); +EXPORT_SYMBOL_GPL(register_jprobes); +EXPORT_SYMBOL_GPL(unregister_jprobes); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(jprobe_return); #endif @@ -1075,4 +1293,6 @@ EXPORT_SYMBOL_GPL(jprobe_return); #ifdef CONFIG_KPROBES EXPORT_SYMBOL_GPL(register_kretprobe); EXPORT_SYMBOL_GPL(unregister_kretprobe); +EXPORT_SYMBOL_GPL(register_kretprobes); +EXPORT_SYMBOL_GPL(unregister_kretprobes); #endif diff --git a/kernel/kthread.c b/kernel/kthread.c index 0ac887882f9..bd1b9ea024e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,6 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> -#include <asm/semaphore.h> #define KTHREAD_NICE_LEVEL (-5) @@ -99,7 +98,7 @@ static void create_kthread(struct kthread_create_info *create) struct sched_param param = { .sched_priority = 0 }; wait_for_completion(&create->started); read_lock(&tasklist_lock); - create->result = find_task_by_pid(pid); + create->result = find_task_by_pid_ns(pid, &init_pid_ns); read_unlock(&tasklist_lock); /* * root may have changed our (kthreadd's) priority or CPU mask. @@ -145,9 +144,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), spin_lock(&kthread_create_lock); list_add_tail(&create.list, &kthread_create_list); - wake_up_process(kthreadd_task); spin_unlock(&kthread_create_lock); + wake_up_process(kthreadd_task); wait_for_completion(&create.done); if (!IS_ERR(create.result)) { @@ -180,6 +179,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) wait_task_inactive(k); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); + k->rt.nr_cpus_allowed = 1; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/latencytop.c b/kernel/latencytop.c index b4e3c85abe7..5e7b45c5692 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -64,8 +64,8 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record return; for (i = 0; i < MAXLR; i++) { - int q; - int same = 1; + int q, same = 1; + /* Nothing stored: */ if (!latency_record[i].backtrace[0]) { if (firstnonnull > i) @@ -73,12 +73,15 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record continue; } for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (latency_record[i].backtrace[q] != - lat->backtrace[q]) + unsigned long record = lat->backtrace[q]; + + if (latency_record[i].backtrace[q] != record) { same = 0; - if (same && lat->backtrace[q] == 0) break; - if (same && lat->backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { @@ -143,14 +146,18 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) for (i = 0; i < LT_SAVECOUNT ; i++) { struct latency_record *mylat; int same = 1; + mylat = &tsk->latency_record[i]; for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { - if (mylat->backtrace[q] != - lat.backtrace[q]) + unsigned long record = lat.backtrace[q]; + + if (mylat->backtrace[q] != record) { same = 0; - if (same && lat.backtrace[q] == 0) break; - if (same && lat.backtrace[q] == ULONG_MAX) + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) break; } if (same) { @@ -226,14 +233,7 @@ static struct file_operations lstats_fops = { static int __init init_lstats_procfs(void) { - struct proc_dir_entry *pe; - - pe = create_proc_entry("latency_stats", 0644, NULL); - if (!pe) - return -ENOMEM; - - pe->proc_fops = &lstats_fops; - + proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } __initcall(init_lstats_procfs); diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 8a135bd163c..dc5d29648d8 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -660,20 +660,12 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { - struct proc_dir_entry *entry; - - entry = create_proc_entry("lockdep", S_IRUSR, NULL); - if (entry) - entry->proc_fops = &proc_lockdep_operations; - - entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL); - if (entry) - entry->proc_fops = &proc_lockdep_stats_operations; + proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create("lockdep_stats", S_IRUSR, NULL, + &proc_lockdep_stats_operations); #ifdef CONFIG_LOCK_STAT - entry = create_proc_entry("lock_stat", S_IRUSR, NULL); - if (entry) - entry->proc_fops = &proc_lock_stat_operations; + proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); #endif return 0; diff --git a/kernel/marker.c b/kernel/marker.c index 041c33e3e95..b5a9fe1d50d 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -23,12 +23,13 @@ #include <linux/rcupdate.h> #include <linux/marker.h> #include <linux/err.h> +#include <linux/slab.h> extern struct marker __start___markers[]; extern struct marker __stop___markers[]; /* Set to 1 to enable marker debug output */ -const int marker_debug; +static const int marker_debug; /* * markers_mutex nests inside module_mutex. Markers mutex protects the builtin @@ -671,6 +672,9 @@ int marker_probe_register(const char *name, const char *format, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); @@ -714,6 +718,9 @@ int marker_probe_unregister(const char *name, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ ret = 0; @@ -792,6 +799,9 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); remove_marker(entry->name); /* Ignore busy error message */ end: diff --git a/kernel/module.c b/kernel/module.c index 5d437bffd8d..5f80478b746 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,7 +43,6 @@ #include <linux/mutex.h> #include <linux/unwind.h> #include <asm/uaccess.h> -#include <asm/semaphore.h> #include <asm/cacheflush.h> #include <linux/license.h> #include <asm/sections.h> @@ -165,131 +164,140 @@ static const struct kernel_symbol *lookup_symbol(const char *name, return NULL; } -static void printk_unused_warning(const char *name) +static bool always_ok(bool gplok, bool warn, const char *name) { - printk(KERN_WARNING "Symbol %s is marked as UNUSED, " - "however this module is using it.\n", name); - printk(KERN_WARNING "This symbol will go away in the future.\n"); - printk(KERN_WARNING "Please evalute if this is the right api to use, " - "and if it really is, submit a report the linux kernel " - "mailinglist together with submitting your code for " - "inclusion.\n"); + return true; } -/* Find a symbol, return value, crc and module which owns it */ -static unsigned long __find_symbol(const char *name, - struct module **owner, - const unsigned long **crc, - int gplok) +static bool printk_unused_warning(bool gplok, bool warn, const char *name) { - struct module *mod; - const struct kernel_symbol *ks; - - /* Core kernel first. */ - *owner = NULL; - ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); - if (ks) { - *crc = symversion(__start___kcrctab, (ks - __start___ksymtab)); - return ks->value; + if (warn) { + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", name); + printk(KERN_WARNING + "This symbol will go away in the future.\n"); + printk(KERN_WARNING + "Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); } - if (gplok) { - ks = lookup_symbol(name, __start___ksymtab_gpl, - __stop___ksymtab_gpl); - if (ks) { - *crc = symversion(__start___kcrctab_gpl, - (ks - __start___ksymtab_gpl)); - return ks->value; - } - } - ks = lookup_symbol(name, __start___ksymtab_gpl_future, - __stop___ksymtab_gpl_future); - if (ks) { - if (!gplok) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more " - "details.\n"); - } - *crc = symversion(__start___kcrctab_gpl_future, - (ks - __start___ksymtab_gpl_future)); - return ks->value; + return true; +} + +static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name) +{ + if (!gplok) + return false; + return printk_unused_warning(gplok, warn, name); +} + +static bool gpl_only(bool gplok, bool warn, const char *name) +{ + return gplok; +} + +static bool warn_if_not_gpl(bool gplok, bool warn, const char *name) +{ + if (!gplok && warn) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more details.\n"); } + return true; +} - ks = lookup_symbol(name, __start___ksymtab_unused, - __stop___ksymtab_unused); - if (ks) { - printk_unused_warning(name); - *crc = symversion(__start___kcrctab_unused, - (ks - __start___ksymtab_unused)); - return ks->value; +struct symsearch { + const struct kernel_symbol *start, *stop; + const unsigned long *crcs; + bool (*check)(bool gplok, bool warn, const char *name); +}; + +/* Look through this array of symbol tables for a symbol match which + * passes the check function. */ +static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, + unsigned int num, + const char *name, + bool gplok, + bool warn, + const unsigned long **crc) +{ + unsigned int i; + const struct kernel_symbol *ks; + + for (i = 0; i < num; i++) { + ks = lookup_symbol(name, arr[i].start, arr[i].stop); + if (!ks || !arr[i].check(gplok, warn, name)) + continue; + + if (crc) + *crc = symversion(arr[i].crcs, ks - arr[i].start); + return ks; } + return NULL; +} - if (gplok) - ks = lookup_symbol(name, __start___ksymtab_unused_gpl, - __stop___ksymtab_unused_gpl); +/* Find a symbol, return value, (optional) crc and (optional) module + * which owns it */ +static unsigned long find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) +{ + struct module *mod; + const struct kernel_symbol *ks; + const struct symsearch arr[] = { + { __start___ksymtab, __stop___ksymtab, __start___kcrctab, + always_ok }, + { __start___ksymtab_gpl, __stop___ksymtab_gpl, + __start___kcrctab_gpl, gpl_only }, + { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, + __start___kcrctab_gpl_future, warn_if_not_gpl }, + { __start___ksymtab_unused, __stop___ksymtab_unused, + __start___kcrctab_unused, printk_unused_warning }, + { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, + __start___kcrctab_unused_gpl, gpl_only_unused_warning }, + }; + + /* Core kernel first. */ + ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); if (ks) { - printk_unused_warning(name); - *crc = symversion(__start___kcrctab_unused_gpl, - (ks - __start___ksymtab_unused_gpl)); + if (owner) + *owner = NULL; return ks->value; } /* Now try modules. */ list_for_each_entry(mod, &modules, list) { - *owner = mod; - ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); - if (ks) { - *crc = symversion(mod->crcs, (ks - mod->syms)); - return ks->value; - } - - if (gplok) { - ks = lookup_symbol(name, mod->gpl_syms, - mod->gpl_syms + mod->num_gpl_syms); - if (ks) { - *crc = symversion(mod->gpl_crcs, - (ks - mod->gpl_syms)); - return ks->value; - } - } - ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms); + struct symsearch arr[] = { + { mod->syms, mod->syms + mod->num_syms, mod->crcs, + always_ok }, + { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, + mod->gpl_crcs, gpl_only }, + { mod->gpl_future_syms, + mod->gpl_future_syms + mod->num_gpl_future_syms, + mod->gpl_future_crcs, warn_if_not_gpl }, + { mod->unused_syms, + mod->unused_syms + mod->num_unused_syms, + mod->unused_crcs, printk_unused_warning }, + { mod->unused_gpl_syms, + mod->unused_gpl_syms + mod->num_unused_gpl_syms, + mod->unused_gpl_crcs, gpl_only_unused_warning }, + }; + + ks = search_symarrays(arr, ARRAY_SIZE(arr), + name, gplok, warn, crc); if (ks) { - printk_unused_warning(name); - *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms)); - return ks->value; - } - - if (gplok) { - ks = lookup_symbol(name, mod->unused_gpl_syms, - mod->unused_gpl_syms + mod->num_unused_gpl_syms); - if (ks) { - printk_unused_warning(name); - *crc = symversion(mod->unused_gpl_crcs, - (ks - mod->unused_gpl_syms)); - return ks->value; - } - } - ks = lookup_symbol(name, mod->gpl_future_syms, - (mod->gpl_future_syms + - mod->num_gpl_future_syms)); - if (ks) { - if (!gplok) { - printk(KERN_WARNING "Symbol %s is being used " - "by a non-GPL module, which will not " - "be allowed in the future\n", name); - printk(KERN_WARNING "Please see the file " - "Documentation/feature-removal-schedule.txt " - "in the kernel source tree for more " - "details.\n"); - } - *crc = symversion(mod->gpl_future_crcs, - (ks - mod->gpl_future_syms)); + if (owner) + *owner = mod; return ks->value; } } + DEBUGP("Failed to find symbol %s\n", name); return -ENOENT; } @@ -664,7 +672,7 @@ static void free_module(struct module *mod); static void wait_for_zero_refcount(struct module *mod) { - /* Since we might sleep for some time, drop the semaphore first */ + /* Since we might sleep for some time, release the mutex first */ mutex_unlock(&module_mutex); for (;;) { DEBUGP("Looking at refcount...\n"); @@ -737,12 +745,13 @@ sys_delete_module(const char __user *name_user, unsigned int flags) if (!forced && module_refcount(mod) != 0) wait_for_zero_refcount(mod); + mutex_unlock(&module_mutex); /* Final destruction now noone is using it. */ - if (mod->exit != NULL) { - mutex_unlock(&module_mutex); + if (mod->exit != NULL) mod->exit(); - mutex_lock(&module_mutex); - } + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); free_module(mod); @@ -778,10 +787,9 @@ static void print_unload_info(struct seq_file *m, struct module *mod) void __symbol_put(const char *symbol) { struct module *owner; - const unsigned long *crc; preempt_disable(); - if (IS_ERR_VALUE(__find_symbol(symbol, &owner, &crc, 1))) + if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) BUG(); module_put(owner); preempt_enable(); @@ -882,6 +890,19 @@ static struct module_attribute *modinfo_attrs[] = { static const char vermagic[] = VERMAGIC_STRING; +static int try_to_force_load(struct module *mod, const char *symname) +{ +#ifdef CONFIG_MODULE_FORCE_LOAD + if (!(tainted & TAINT_FORCED_MODULE)) + printk("%s: no version for \"%s\" found: kernel tainted.\n", + mod->name, symname); + add_taint_module(mod, TAINT_FORCED_MODULE); + return 0; +#else + return -ENOEXEC; +#endif +} + #ifdef CONFIG_MODVERSIONS static int check_version(Elf_Shdr *sechdrs, unsigned int versindex, @@ -896,6 +917,10 @@ static int check_version(Elf_Shdr *sechdrs, if (!crc) return 1; + /* No versions at all? modprobe --force does this. */ + if (versindex == 0) + return try_to_force_load(mod, symname) == 0; + versions = (void *) sechdrs[versindex].sh_addr; num_versions = sechdrs[versindex].sh_size / sizeof(struct modversion_info); @@ -906,18 +931,19 @@ static int check_version(Elf_Shdr *sechdrs, if (versions[i].crc == *crc) return 1; - printk("%s: disagrees about version of symbol %s\n", - mod->name, symname); DEBUGP("Found checksum %lX vs module %lX\n", *crc, versions[i].crc); - return 0; + goto bad_version; } - /* Not in module's version table. OK, but that taints the kernel. */ - if (!(tainted & TAINT_FORCED_MODULE)) - printk("%s: no version for \"%s\" found: kernel tainted.\n", - mod->name, symname); - add_taint_module(mod, TAINT_FORCED_MODULE); - return 1; + + printk(KERN_WARNING "%s: no symbol version for %s\n", + mod->name, symname); + return 0; + +bad_version: + printk("%s: disagrees about version of symbol %s\n", + mod->name, symname); + return 0; } static inline int check_modstruct_version(Elf_Shdr *sechdrs, @@ -925,20 +951,20 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, struct module *mod) { const unsigned long *crc; - struct module *owner; - if (IS_ERR_VALUE(__find_symbol("struct_module", - &owner, &crc, 1))) + if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) BUG(); - return check_version(sechdrs, versindex, "struct_module", mod, - crc); + return check_version(sechdrs, versindex, "struct_module", mod, crc); } -/* First part is kernel version, which we ignore. */ -static inline int same_magic(const char *amagic, const char *bmagic) +/* First part is kernel version, which we ignore if module has crcs. */ +static inline int same_magic(const char *amagic, const char *bmagic, + bool has_crcs) { - amagic += strcspn(amagic, " "); - bmagic += strcspn(bmagic, " "); + if (has_crcs) { + amagic += strcspn(amagic, " "); + bmagic += strcspn(bmagic, " "); + } return strcmp(amagic, bmagic) == 0; } #else @@ -958,7 +984,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, return 1; } -static inline int same_magic(const char *amagic, const char *bmagic) +static inline int same_magic(const char *amagic, const char *bmagic, + bool has_crcs) { return strcmp(amagic, bmagic) == 0; } @@ -975,8 +1002,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, unsigned long ret; const unsigned long *crc; - ret = __find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE)); + ret = find_symbol(name, &owner, &crc, + !(mod->taints & TAINT_PROPRIETARY_MODULE), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -992,6 +1019,20 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, * J. Corbet <corbet@lwn.net> */ #if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) +struct module_sect_attr +{ + struct module_attribute mattr; + char *name; + unsigned long address; +}; + +struct module_sect_attrs +{ + struct attribute_group grp; + unsigned int nsections; + struct module_sect_attr attrs[0]; +}; + static ssize_t module_sect_show(struct module_attribute *mattr, struct module *mod, char *buf) { @@ -1002,7 +1043,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, static void free_sect_attrs(struct module_sect_attrs *sect_attrs) { - int section; + unsigned int section; for (section = 0; section < sect_attrs->nsections; section++) kfree(sect_attrs->attrs[section].name); @@ -1296,7 +1337,19 @@ out_unreg: kobject_put(&mod->mkobj.kobj); return err; } -#endif + +static void mod_sysfs_fini(struct module *mod) +{ + kobject_put(&mod->mkobj.kobj); +} + +#else /* CONFIG_SYSFS */ + +static void mod_sysfs_fini(struct module *mod) +{ +} + +#endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { @@ -1304,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod) module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); kobject_put(mod->holders_dir); - kobject_put(&mod->mkobj.kobj); + mod_sysfs_fini(mod); } /* @@ -1363,10 +1416,9 @@ void *__symbol_get(const char *symbol) { struct module *owner; unsigned long value; - const unsigned long *crc; preempt_disable(); - value = __find_symbol(symbol, &owner, &crc, 1); + value = find_symbol(symbol, &owner, NULL, true, true); if (IS_ERR_VALUE(value)) value = 0; else if (strong_try_module_get(owner)) @@ -1383,33 +1435,33 @@ EXPORT_SYMBOL_GPL(__symbol_get); */ static int verify_export_symbols(struct module *mod) { - const char *name = NULL; - unsigned long i, ret = 0; + unsigned int i; struct module *owner; - const unsigned long *crc; - - for (i = 0; i < mod->num_syms; i++) - if (!IS_ERR_VALUE(__find_symbol(mod->syms[i].name, - &owner, &crc, 1))) { - name = mod->syms[i].name; - ret = -ENOEXEC; - goto dup; - } + const struct kernel_symbol *s; + struct { + const struct kernel_symbol *sym; + unsigned int num; + } arr[] = { + { mod->syms, mod->num_syms }, + { mod->gpl_syms, mod->num_gpl_syms }, + { mod->gpl_future_syms, mod->num_gpl_future_syms }, + { mod->unused_syms, mod->num_unused_syms }, + { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, + }; - for (i = 0; i < mod->num_gpl_syms; i++) - if (!IS_ERR_VALUE(__find_symbol(mod->gpl_syms[i].name, - &owner, &crc, 1))) { - name = mod->gpl_syms[i].name; - ret = -ENOEXEC; - goto dup; + for (i = 0; i < ARRAY_SIZE(arr); i++) { + for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { + if (!IS_ERR_VALUE(find_symbol(s->name, &owner, + NULL, true, false))) { + printk(KERN_ERR + "%s: exports duplicate symbol %s" + " (owned by %s)\n", + mod->name, s->name, module_name(owner)); + return -ENOEXEC; + } } - -dup: - if (ret) - printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", - mod->name, name, module_name(owner)); - - return ret; + } + return 0; } /* Change all symbols so that st_value encodes the pointer directly. */ @@ -1740,7 +1792,7 @@ static struct module *load_module(void __user *umod, /* Sanity checks against insmoding binaries or wrong arch, weird elf version */ - if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) || hdr->e_shentsize != sizeof(*sechdrs)) { @@ -1815,8 +1867,9 @@ static struct module *load_module(void __user *umod, unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); #endif - /* Don't keep modinfo section */ + /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; + sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; #ifdef CONFIG_KALLSYMS /* Keep symbol and string tables for decoding later. */ sechdrs[symindex].sh_flags |= SHF_ALLOC; @@ -1834,10 +1887,10 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - add_taint_module(mod, TAINT_FORCED_MODULE); - printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", - mod->name); - } else if (!same_magic(modmagic, vermagic)) { + err = try_to_force_load(mod, "magic"); + if (err) + goto free_hdr; + } else if (!same_magic(modmagic, vermagic, versindex)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); err = -ENOEXEC; @@ -1978,7 +2031,8 @@ static struct module *load_module(void __user *umod, mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; if (unusedgplcrcindex) - mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr; + mod->unused_gpl_crcs + = (void *)sechdrs[unusedgplcrcindex].sh_addr; #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !crcindex) || @@ -1986,9 +2040,10 @@ static struct module *load_module(void __user *umod, (mod->num_gpl_future_syms && !gplfuturecrcindex) || (mod->num_unused_syms && !unusedcrcindex) || (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { - printk(KERN_WARNING "%s: No versions for exported symbols." - " Tainting kernel.\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE); + printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); + err = try_to_force_load(mod, "nocrc"); + if (err) + goto cleanup; } #endif markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); @@ -2172,6 +2227,8 @@ sys_init_module(void __user *umod, mod->state = MODULE_STATE_GOING; synchronize_sched(); module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); mutex_lock(&module_mutex); free_module(mod); mutex_unlock(&module_mutex); diff --git a/kernel/notifier.c b/kernel/notifier.c index 643360d1bb1..823be11584e 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -31,6 +31,21 @@ static int notifier_chain_register(struct notifier_block **nl, return 0; } +static int notifier_chain_cond_register(struct notifier_block **nl, + struct notifier_block *n) +{ + while ((*nl) != NULL) { + if ((*nl) == n) + return 0; + if (n->priority > (*nl)->priority) + break; + nl = &((*nl)->next); + } + n->next = *nl; + rcu_assign_pointer(*nl, n); + return 0; +} + static int notifier_chain_unregister(struct notifier_block **nl, struct notifier_block *n) { @@ -205,6 +220,29 @@ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); /** + * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain, only if not already + * present in the chain. + * Must be called in process context. + * + * Currently always returns zero. + */ +int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + down_write(&nh->rwsem); + ret = notifier_chain_cond_register(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); + +/** * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain * @nh: Pointer to head of the blocking notifier chain * @n: Entry to remove from notifier chain diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index aead4d69f62..48d7ed6fc3a 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -7,6 +7,8 @@ #include <linux/module.h> #include <linux/cgroup.h> #include <linux/fs.h> +#include <linux/slab.h> +#include <linux/nsproxy.h> struct ns_cgroup { struct cgroup_subsys_state css; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f5d332cf8c6..adc785146a1 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -139,6 +139,18 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) goto out; } + /* + * CLONE_NEWIPC must detach from the undolist: after switching + * to a new ipc namespace, the semaphore arrays from the old + * namespace are unreachable. In clone parlance, CLONE_SYSVSEM + * means share undolist with parent, so we must forbid using + * it along with CLONE_NEWIPC. + */ + if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { + err = -EINVAL; + goto out; + } + new_ns = create_new_namespaces(flags, tsk, tsk->fs); if (IS_ERR(new_ns)) { err = PTR_ERR(new_ns); diff --git a/kernel/panic.c b/kernel/panic.c index 24af9f8bac9..425567f45b9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -153,6 +153,8 @@ EXPORT_SYMBOL(panic); * 'M' - System experienced a machine check exception. * 'B' - System has hit bad_page. * 'U' - Userspace-defined naughtiness. + * 'A' - ACPI table overridden. + * 'W' - Taint on warning. * * The string is overwritten by the next call to print_taint(). */ @@ -161,7 +163,7 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', @@ -170,7 +172,8 @@ const char *print_tainted(void) tainted & TAINT_BAD_PAGE ? 'B' : ' ', tainted & TAINT_USER ? 'U' : ' ', tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); + tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', + tainted & TAINT_WARN ? 'W' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); @@ -312,6 +315,7 @@ void warn_on_slowpath(const char *file, int line) print_modules(); dump_stack(); print_oops_end_marker(); + add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_on_slowpath); #endif diff --git a/kernel/pid.c b/kernel/pid.c index 477691576b3..20d59fa2d49 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -111,10 +111,11 @@ EXPORT_SYMBOL(is_container_init); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct pid_namespace *pid_ns, int pid) +static void free_pidmap(struct upid *upid) { - struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; - int offset = pid & BITS_PER_PAGE_MASK; + int nr = upid->nr; + struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; + int offset = nr & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); @@ -232,7 +233,7 @@ void free_pid(struct pid *pid) spin_unlock_irqrestore(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); + free_pidmap(pid->numbers + i); call_rcu(&pid->rcu, delayed_put_pid); } @@ -278,8 +279,8 @@ out: return pid; out_free: - for (i++; i <= ns->level; i++) - free_pidmap(pid->numbers[i].ns, pid->numbers[i].nr); + while (++i <= ns->level) + free_pidmap(pid->numbers + i); kmem_cache_free(ns->pid_cachep, pid); pid = NULL; @@ -316,7 +317,7 @@ EXPORT_SYMBOL_GPL(find_pid); /* * attach_pid() must be called with the tasklist_lock write-held. */ -int attach_pid(struct task_struct *task, enum pid_type type, +void attach_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { struct pid_link *link; @@ -324,11 +325,10 @@ int attach_pid(struct task_struct *task, enum pid_type type, link = &task->pids[type]; link->pid = pid; hlist_add_head_rcu(&link->node, &pid->tasks[type]); - - return 0; } -void detach_pid(struct task_struct *task, enum pid_type type) +static void __change_pid(struct task_struct *task, enum pid_type type, + struct pid *new) { struct pid_link *link; struct pid *pid; @@ -338,7 +338,7 @@ void detach_pid(struct task_struct *task, enum pid_type type) pid = link->pid; hlist_del_rcu(&link->node); - link->pid = NULL; + link->pid = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (!hlist_empty(&pid->tasks[tmp])) @@ -347,13 +347,24 @@ void detach_pid(struct task_struct *task, enum pid_type type) free_pid(pid); } +void detach_pid(struct task_struct *task, enum pid_type type) +{ + __change_pid(task, type, NULL); +} + +void change_pid(struct task_struct *task, enum pid_type type, + struct pid *pid) +{ + __change_pid(task, type, pid); + attach_pid(task, type, pid); +} + /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { new->pids[type].pid = old->pids[type].pid; hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); - old->pids[type].pid = NULL; } struct task_struct *pid_task(struct pid *pid, enum pid_type type) @@ -380,12 +391,6 @@ struct task_struct *find_task_by_pid_type_ns(int type, int nr, EXPORT_SYMBOL(find_task_by_pid_type_ns); -struct task_struct *find_task_by_pid(pid_t nr) -{ - return find_task_by_pid_type_ns(PIDTYPE_PID, nr, &init_pid_ns); -} -EXPORT_SYMBOL(find_task_by_pid); - struct task_struct *find_task_by_vpid(pid_t vnr) { return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6d792b66d85..98702b4b885 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -66,7 +66,7 @@ err_alloc: return NULL; } -static struct pid_namespace *create_pid_namespace(int level) +static struct pid_namespace *create_pid_namespace(unsigned int level) { struct pid_namespace *ns; int i; @@ -92,7 +92,7 @@ static struct pid_namespace *create_pid_namespace(int level) atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); for (i = 1; i < PIDMAP_ENTRIES; i++) { - ns->pidmap[i].page = 0; + ns->pidmap[i].page = NULL; atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 2eae91f954c..f1525ad06cb 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -4,8 +4,9 @@ #include <linux/sched.h> #include <linux/posix-timers.h> -#include <asm/uaccess.h> #include <linux/errno.h> +#include <linux/math64.h> +#include <asm/uaccess.h> static int check_clock(const clockid_t which_clock) { @@ -47,12 +48,10 @@ static void sample_to_timespec(const clockid_t which_clock, union cpu_time_count cpu, struct timespec *tp) { - if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - tp->tv_sec = div_long_long_rem(cpu.sched, - NSEC_PER_SEC, &tp->tv_nsec); - } else { + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) + *tp = ns_to_timespec(cpu.sched); + else cputime_to_timespec(cpu.cpu, tp); - } } static inline int cpu_time_before(const clockid_t which_clock, @@ -1087,45 +1086,45 @@ static void check_process_timers(struct task_struct *tsk, maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { - prof_expires = t->expires.cpu; + if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { + prof_expires = tl->expires.cpu; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } ++timers; maxfire = 20; virt_expires = cputime_zero; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { - virt_expires = t->expires.cpu; + if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { + virt_expires = tl->expires.cpu; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } ++timers; maxfire = 20; sched_expires = 0; while (!list_empty(timers)) { - struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || sum_sched_runtime < t->expires.sched) { - sched_expires = t->expires.sched; + if (!--maxfire || sum_sched_runtime < tl->expires.sched) { + sched_expires = tl->expires.sched; break; } - t->firing = 1; - list_move_tail(&t->entry, firing); + tl->firing = 1; + list_move_tail(&tl->entry, firing); } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a9b04203a66..dbd8398ddb0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -37,7 +37,6 @@ #include <linux/mutex.h> #include <asm/uaccess.h> -#include <asm/semaphore.h> #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> @@ -311,8 +310,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private) if (timr->it_sigev_notify & SIGEV_THREAD_ID) { struct task_struct *leader; - int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); + int ret = send_sigqueue(timr->sigq, timr->it_process, 0); if (likely(ret >= 0)) return ret; @@ -323,8 +321,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private) timr->it_process = leader; } - return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); + return send_sigqueue(timr->sigq, timr->it_process, 1); } EXPORT_SYMBOL_GPL(posix_timer_event); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6233f3b4ae6..b45da40e8d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -19,16 +19,6 @@ config PM will issue the hlt instruction if nothing is to be done, thereby sending the processor to sleep and saving power. -config PM_LEGACY - bool "Legacy Power Management API (DEPRECATED)" - depends on PM - default n - ---help--- - Support for pm_register() and friends. This old API is obsoleted - by the driver model. - - If unsure, say N. - config PM_DEBUG bool "Power Management Debug Support" depends on PM diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f7dfff28ecd..597823b5b70 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -4,7 +4,6 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := main.o -obj-$(CONFIG_PM_LEGACY) += pm.o obj-$(CONFIG_PM_SLEEP) += process.o console.o obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 89bcf4973ee..b8628be2a46 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -7,17 +7,39 @@ #include <linux/vt_kern.h> #include <linux/kbd_kern.h> #include <linux/console.h> +#include <linux/module.h> #include "power.h" #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static int disable_vt_switch; + +/* + * Normally during a suspend, we allocate a new console and switch to it. + * When we resume, we switch back to the original console. This switch + * can be slow, so on systems where the framebuffer can handle restoration + * of video registers anyways, there's little point in doing the console + * switch. This function allows you to disable it by passing it '0'. + */ +void pm_set_vt_switch(int do_switch) +{ + acquire_console_sem(); + disable_vt_switch = !do_switch; + release_console_sem(); +} +EXPORT_SYMBOL(pm_set_vt_switch); int pm_prepare_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return 0; + } + orig_fgconsole = fg_console; if (vc_allocate(SUSPEND_CONSOLE)) { @@ -50,9 +72,12 @@ int pm_prepare_console(void) void pm_restore_console(void) { acquire_console_sem(); + if (disable_vt_switch) { + release_console_sem(); + return; + } set_console(orig_fgconsole); release_console_sem(); kmsg_redirect = orig_kmsg; - return; } #endif diff --git a/kernel/power/pm.c b/kernel/power/pm.c deleted file mode 100644 index 60c73fa670d..00000000000 --- a/kernel/power/pm.c +++ /dev/null @@ -1,205 +0,0 @@ -/* - * pm.c - Power management interface - * - * Copyright (C) 2000 Andrew Henroid - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/pm.h> -#include <linux/pm_legacy.h> -#include <linux/interrupt.h> -#include <linux/mutex.h> - -/* - * Locking notes: - * pm_devs_lock can be a semaphore providing pm ops are not called - * from an interrupt handler (already a bad idea so no change here). Each - * change must be protected so that an unlink of an entry doesn't clash - * with a pm send - which is permitted to sleep in the current architecture - * - * Module unloads clashing with pm events now work out safely, the module - * unload path will block until the event has been sent. It may well block - * until a resume but that will be fine. - */ - -static DEFINE_MUTEX(pm_devs_lock); -static LIST_HEAD(pm_devs); - -/** - * pm_register - register a device with power management - * @type: device type - * @id: device ID - * @callback: callback function - * - * Add a device to the list of devices that wish to be notified about - * power management events. A &pm_dev structure is returned on success, - * on failure the return is %NULL. - * - * The callback function will be called in process context and - * it may sleep. - */ - -struct pm_dev *pm_register(pm_dev_t type, - unsigned long id, - pm_callback callback) -{ - struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); - if (dev) { - dev->type = type; - dev->id = id; - dev->callback = callback; - - mutex_lock(&pm_devs_lock); - list_add(&dev->entry, &pm_devs); - mutex_unlock(&pm_devs_lock); - } - return dev; -} - -/** - * pm_send - send request to a single device - * @dev: device to send to - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a given device. The - * %PM_SUSPEND and %PM_RESUME events are handled specially. The - * data field must hold the intended next state. No call is made - * if the state matches. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - * - * WARNING: Calling pm_send directly is not generally recommended, in - * particular there is no locking against the pm_dev going away. The - * caller must maintain all needed locking or have 'inside knowledge' - * on the safety. Also remember that this function is not locked against - * pm_unregister. This means that you must handle SMP races on callback - * execution and unload yourself. - */ - -static int pm_send(struct pm_dev *dev, pm_request_t rqst, void *data) -{ - int status = 0; - unsigned long prev_state, next_state; - - if (in_interrupt()) - BUG(); - - switch (rqst) { - case PM_SUSPEND: - case PM_RESUME: - prev_state = dev->state; - next_state = (unsigned long) data; - if (prev_state != next_state) { - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - if (!status) { - dev->state = next_state; - dev->prev_state = prev_state; - } - } - else { - dev->prev_state = prev_state; - } - break; - default: - if (dev->callback) - status = (*dev->callback)(dev, rqst, data); - break; - } - return status; -} - -/* - * Undo incomplete request - */ -static void pm_undo_all(struct pm_dev *last) -{ - struct list_head *entry = last->entry.prev; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->state != dev->prev_state) { - /* previous state was zero (running) resume or - * previous state was non-zero (suspended) suspend - */ - pm_request_t undo = (dev->prev_state - ? PM_SUSPEND:PM_RESUME); - pm_send(dev, undo, (void*) dev->prev_state); - } - entry = entry->prev; - } -} - -/** - * pm_send_all - send request to all managed devices - * @rqst: power management request - * @data: data for the callback - * - * Issue a power management request to a all devices. The - * %PM_SUSPEND events are handled specially. Any device is - * permitted to fail a suspend by returning a non zero (error) - * value from its callback function. If any device vetoes a - * suspend request then all other devices that have suspended - * during the processing of this request are restored to their - * previous state. - * - * WARNING: This function takes the pm_devs_lock. The lock is not dropped until - * the callbacks have completed. This prevents races against pm locking - * functions, races against module unload pm_unregister code. It does - * mean however that you must not issue pm_ functions within the callback - * or you will deadlock and users will hate you. - * - * Zero is returned on success. If a suspend fails then the status - * from the device that vetoes the suspend is returned. - * - * BUGS: what stops two power management requests occurring in parallel - * and conflicting. - */ - -int pm_send_all(pm_request_t rqst, void *data) -{ - struct list_head *entry; - - mutex_lock(&pm_devs_lock); - entry = pm_devs.next; - while (entry != &pm_devs) { - struct pm_dev *dev = list_entry(entry, struct pm_dev, entry); - if (dev->callback) { - int status = pm_send(dev, rqst, data); - if (status) { - /* return devices to previous state on - * failed suspend request - */ - if (rqst == PM_SUSPEND) - pm_undo_all(dev); - mutex_unlock(&pm_devs_lock); - return status; - } - } - entry = entry->next; - } - mutex_unlock(&pm_devs_lock); - return 0; -} - -EXPORT_SYMBOL(pm_register); -EXPORT_SYMBOL(pm_send_all); - diff --git a/kernel/printk.c b/kernel/printk.c index c46a20a19a1..8fb01c32aa3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -111,6 +111,9 @@ struct console_cmdline char name[8]; /* Name of the driver */ int index; /* Minor dev. to use */ char *options; /* Options for the driver */ +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + char *brl_options; /* Options for braille driver */ +#endif }; #define MAX_CMDLINECONSOLES 8 @@ -643,8 +646,21 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu) { int retval = 0; - if (can_use_console(cpu)) - retval = !try_acquire_console_sem(); + if (!try_acquire_console_sem()) { + retval = 1; + + /* + * If we can't use the console, we need to release + * the console semaphore by hand to avoid flushing + * the buffer. We need to hold the console semaphore + * in order to do this test safely. + */ + if (!can_use_console(cpu)) { + console_locked = 0; + up(&console_sem); + retval = 0; + } + } printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); return retval; @@ -795,15 +811,60 @@ static void call_console_drivers(unsigned start, unsigned end) #endif +static int __add_preferred_console(char *name, int idx, char *options, + char *brl_options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. + */ + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + if (!brl_options) + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + if (!brl_options) + selected_console = i; + c = &console_cmdline[i]; + strlcpy(c->name, name, sizeof(c->name)); + c->options = options; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + c->brl_options = brl_options; +#endif + c->index = idx; + return 0; +} /* * Set up a list of consoles. Called from init/main.c */ static int __init console_setup(char *str) { char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ - char *s, *options; + char *s, *options, *brl_options = NULL; int idx; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (!memcmp(str, "brl,", 4)) { + brl_options = ""; + str += 4; + } else if (!memcmp(str, "brl=", 4)) { + brl_options = str + 4; + str = strchr(brl_options, ','); + if (!str) { + printk(KERN_ERR "need port name after brl=\n"); + return 1; + } + *(str++) = 0; + } +#endif + /* * Decode str into name, index, options. */ @@ -828,7 +889,7 @@ static int __init console_setup(char *str) idx = simple_strtoul(s, NULL, 10); *s = 0; - add_preferred_console(buf, idx, options); + __add_preferred_console(buf, idx, options, brl_options); return 1; } __setup("console=", console_setup); @@ -848,28 +909,7 @@ __setup("console=", console_setup); */ int add_preferred_console(char *name, int idx, char *options) { - struct console_cmdline *c; - int i; - - /* - * See if this tty is not yet registered, and - * if we have a slot free. - */ - for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) - if (strcmp(console_cmdline[i].name, name) == 0 && - console_cmdline[i].index == idx) { - selected_console = i; - return 0; - } - if (i == MAX_CMDLINECONSOLES) - return -E2BIG; - selected_console = i; - c = &console_cmdline[i]; - memcpy(c->name, name, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; - c->options = options; - c->index = idx; - return 0; + return __add_preferred_console(name, idx, options, NULL); } int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) @@ -881,7 +921,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha if (strcmp(console_cmdline[i].name, name) == 0 && console_cmdline[i].index == idx) { c = &console_cmdline[i]; - memcpy(c->name, name_new, sizeof(c->name)); + strlcpy(c->name, name_new, sizeof(c->name)); c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; @@ -1150,6 +1190,16 @@ void register_console(struct console *console) continue; if (console->index < 0) console->index = console_cmdline[i].index; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (console_cmdline[i].brl_options) { + console->flags |= CON_BRL; + braille_register_console(console, + console_cmdline[i].index, + console_cmdline[i].options, + console_cmdline[i].brl_options); + return; + } +#endif if (console->setup && console->setup(console, console_cmdline[i].options) != 0) break; @@ -1208,6 +1258,11 @@ int unregister_console(struct console *console) struct console *a, *b; int res = 1; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (console->flags & CON_BRL) + return braille_unregister_console(console); +#endif + acquire_console_sem(); if (console_drivers == console) { console_drivers=console->next; @@ -1259,8 +1314,8 @@ late_initcall(disable_boot_consoles); */ void tty_write_message(struct tty_struct *tty, char *msg) { - if (tty && tty->driver->write) - tty->driver->write(tty, msg, strlen(msg)); + if (tty && tty->ops->write) + tty->ops->write(tty, msg, strlen(msg)); return; } @@ -1274,31 +1329,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) */ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) { - static DEFINE_SPINLOCK(ratelimit_lock); - static unsigned toks = 10 * 5 * HZ; - static unsigned long last_msg; - static int missed; - unsigned long flags; - unsigned long now = jiffies; - - spin_lock_irqsave(&ratelimit_lock, flags); - toks += now - last_msg; - last_msg = now; - if (toks > (ratelimit_burst * ratelimit_jiffies)) - toks = ratelimit_burst * ratelimit_jiffies; - if (toks >= ratelimit_jiffies) { - int lost = missed; - - missed = 0; - toks -= ratelimit_jiffies; - spin_unlock_irqrestore(&ratelimit_lock, flags); - if (lost) - printk(KERN_WARNING "printk: %d messages suppressed.\n", lost); - return 1; - } - missed++; - spin_unlock_irqrestore(&ratelimit_lock, flags); - return 0; + return __ratelimit(ratelimit_jiffies, ratelimit_burst); } EXPORT_SYMBOL(__printk_ratelimit); diff --git a/kernel/profile.c b/kernel/profile.c index 3b7a1b05512..ae7ead82cbc 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -23,7 +23,6 @@ #include <linux/highmem.h> #include <linux/mutex.h> #include <asm/sections.h> -#include <asm/semaphore.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> @@ -588,10 +587,10 @@ static int __init create_proc_profile(void) return 0; if (create_hash_tables()) return -1; - entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); + entry = proc_create("profile", S_IWUSR | S_IRUGO, + NULL, &proc_profile_operations); if (!entry) return 0; - entry->proc_fops = &proc_profile_operations; entry->size = (1+prof_len) * sizeof(atomic_t); hotcpu_notifier(profile_cpu_callback, 0); return 0; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index fdb34e86f92..6c19e94fd0a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,7 +73,7 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); child->ptrace = 0; - if (!list_empty(&child->ptrace_list)) { + if (ptrace_reparented(child)) { list_del_init(&child->ptrace_list); remove_parent(child); child->parent = child->real_parent; @@ -168,8 +168,6 @@ int ptrace_attach(struct task_struct *task) audit_ptrace(task); retval = -EPERM; - if (task->pid <= 1) - goto out; if (same_thread_group(task, current)) goto out; @@ -208,8 +206,7 @@ repeat: __ptrace_link(task, current); - force_sig_specific(SIGSTOP, task); - + send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); @@ -323,9 +320,8 @@ static int ptrace_setoptions(struct task_struct *child, long data) return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; } -static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) +static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) { - siginfo_t lastinfo; int error = -ESRCH; read_lock(&tasklist_lock); @@ -333,31 +329,25 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t __user * data) error = -EINVAL; spin_lock_irq(&child->sighand->siglock); if (likely(child->last_siginfo != NULL)) { - lastinfo = *child->last_siginfo; + *info = *child->last_siginfo; error = 0; } spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!error) - return copy_siginfo_to_user(data, &lastinfo); return error; } -static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) +static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) { - siginfo_t newinfo; int error = -ESRCH; - if (copy_from_user(&newinfo, data, sizeof (siginfo_t))) - return -EFAULT; - read_lock(&tasklist_lock); if (likely(child->sighand != NULL)) { error = -EINVAL; spin_lock_irq(&child->sighand->siglock); if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = newinfo; + *child->last_siginfo = *info; error = 0; } spin_unlock_irq(&child->sighand->siglock); @@ -424,6 +414,7 @@ int ptrace_request(struct task_struct *child, long request, long addr, long data) { int ret = -EIO; + siginfo_t siginfo; switch (request) { case PTRACE_PEEKTEXT: @@ -442,12 +433,22 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GETEVENTMSG: ret = put_user(child->ptrace_message, (unsigned long __user *) data); break; + case PTRACE_GETSIGINFO: - ret = ptrace_getsiginfo(child, (siginfo_t __user *) data); + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user((siginfo_t __user *) data, + &siginfo); break; + case PTRACE_SETSIGINFO: - ret = ptrace_setsiginfo(child, (siginfo_t __user *) data); + if (copy_from_user(&siginfo, (siginfo_t __user *) data, + sizeof siginfo)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); break; + case PTRACE_DETACH: /* detach a process that was attached. */ ret = ptrace_detach(child, data); break; @@ -518,12 +519,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) { struct task_struct *child; - /* - * Tracing init is not allowed. - */ - if (pid == 1) - return ERR_PTR(-EPERM); - read_lock(&tasklist_lock); child = find_task_by_vpid(pid); if (child) @@ -539,7 +534,6 @@ struct task_struct *ptrace_get_task_struct(pid_t pid) #define arch_ptrace_attach(child) do { } while (0) #endif -#ifndef __ARCH_SYS_PTRACE asmlinkage long sys_ptrace(long request, long pid, long addr, long data) { struct task_struct *child; @@ -587,7 +581,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) unlock_kernel(); return ret; } -#endif /* __ARCH_SYS_PTRACE */ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data) { @@ -608,7 +601,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) return (copied == sizeof(data)) ? 0 : -EIO; } -#ifdef CONFIG_COMPAT +#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE #include <linux/compat.h> int compat_ptrace_request(struct task_struct *child, compat_long_t request, @@ -616,6 +609,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, { compat_ulong_t __user *datap = compat_ptr(data); compat_ulong_t word; + siginfo_t siginfo; int ret; switch (request) { @@ -638,6 +632,23 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, ret = put_user((compat_ulong_t) child->ptrace_message, datap); break; + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user32( + (struct compat_siginfo __user *) datap, + &siginfo); + break; + + case PTRACE_SETSIGINFO: + memset(&siginfo, 0, sizeof siginfo); + if (copy_siginfo_from_user32( + &siginfo, (struct compat_siginfo __user *) datap)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); + break; + default: ret = ptrace_request(child, request, addr, data); } @@ -645,7 +656,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data) { @@ -688,6 +698,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, unlock_kernel(); return ret; } -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ - -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306..a38895a5b8e 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -89,8 +89,22 @@ static void force_quiescent_state(struct rcu_data *rdp, /* * Don't send IPI to itself. With irqs disabled, * rdp->cpu is the current cpu. + * + * cpu_online_map is updated by the _cpu_down() + * using stop_machine_run(). Since we're in irqs disabled + * section, stop_machine_run() is not exectuting, hence + * the cpu_online_map is stable. + * + * However, a cpu might have been offlined _just_ before + * we disabled irqs while entering here. + * And rcu subsystem might not yet have handled the CPU_DEAD + * notification, leading to the offlined cpu's bit + * being set in the rcp->cpumask. + * + * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent + * sending smp_reschedule() to an offlined CPU. */ - cpumask = rcp->cpumask; + cpus_and(cpumask, rcp->cpumask, cpu_online_map); cpu_clear(rdp->cpu, cpumask); for_each_cpu_mask(cpu, cpumask) smp_send_reschedule(cpu); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e9517014b57..5e02b774070 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -217,8 +217,6 @@ long rcu_batches_completed(void) } EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - void __rcu_read_lock(void) { int idx; @@ -1007,10 +1005,10 @@ void __synchronize_sched(void) if (sched_getaffinity(0, &oldmask) < 0) oldmask = cpu_possible_map; for_each_online_cpu(cpu) { - sched_setaffinity(0, cpumask_of_cpu(cpu)); + sched_setaffinity(0, &cpumask_of_cpu(cpu)); schedule(); } - sched_setaffinity(0, oldmask); + sched_setaffinity(0, &oldmask); } EXPORT_SYMBOL_GPL(__synchronize_sched); diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index fd599829e72..33acc424667 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -45,6 +45,7 @@ #include <linux/byteorder/swabb.h> #include <linux/stat.h> #include <linux/srcu.h> +#include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " @@ -723,9 +724,10 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ */ static void rcu_torture_shuffle_tasks(void) { - cpumask_t tmp_mask = CPU_MASK_ALL; + cpumask_t tmp_mask; int i; + cpus_setall(tmp_mask); get_online_cpus(); /* No point in shuffling if there is only one online CPU (ex: UP) */ @@ -737,25 +739,27 @@ static void rcu_torture_shuffle_tasks(void) if (rcu_idle_cpu != -1) cpu_clear(rcu_idle_cpu, tmp_mask); - set_cpus_allowed(current, tmp_mask); + set_cpus_allowed_ptr(current, &tmp_mask); if (reader_tasks) { for (i = 0; i < nrealreaders; i++) if (reader_tasks[i]) - set_cpus_allowed(reader_tasks[i], tmp_mask); + set_cpus_allowed_ptr(reader_tasks[i], + &tmp_mask); } if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) if (fakewriter_tasks[i]) - set_cpus_allowed(fakewriter_tasks[i], tmp_mask); + set_cpus_allowed_ptr(fakewriter_tasks[i], + &tmp_mask); } if (writer_task) - set_cpus_allowed(writer_task, tmp_mask); + set_cpus_allowed_ptr(writer_task, &tmp_mask); if (stats_task) - set_cpus_allowed(stats_task, tmp_mask); + set_cpus_allowed_ptr(stats_task, &tmp_mask); if (rcu_idle_cpu == -1) rcu_idle_cpu = num_online_cpus() - 1; diff --git a/kernel/relay.c b/kernel/relay.c index 4c035a8a248..7de644cdec4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -65,6 +65,35 @@ static struct vm_operations_struct relay_file_mmap_ops = { .close = relay_file_mmap_close, }; +/* + * allocate an array of pointers of struct page + */ +static struct page **relay_alloc_page_array(unsigned int n_pages) +{ + struct page **array; + size_t pa_size = n_pages * sizeof(struct page *); + + if (pa_size > PAGE_SIZE) { + array = vmalloc(pa_size); + if (array) + memset(array, 0, pa_size); + } else { + array = kzalloc(pa_size, GFP_KERNEL); + } + return array; +} + +/* + * free an array of pointers of struct page + */ +static void relay_free_page_array(struct page **array) +{ + if (is_vmalloc_addr(array)) + vfree(array); + else + kfree(array); +} + /** * relay_mmap_buf: - mmap channel buffer to process address space * @buf: relay channel buffer @@ -109,7 +138,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) *size = PAGE_ALIGN(*size); n_pages = *size >> PAGE_SHIFT; - buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); + buf->page_array = relay_alloc_page_array(n_pages); if (!buf->page_array) return NULL; @@ -130,7 +159,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) depopulate: for (j = 0; j < i; j++) __free_page(buf->page_array[j]); - kfree(buf->page_array); + relay_free_page_array(buf->page_array); return NULL; } @@ -189,7 +218,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) vunmap(buf->start); for (i = 0; i < buf->page_count; i++) __free_page(buf->page_array[i]); - kfree(buf->page_array); + relay_free_page_array(buf->page_array); } chan->buf[buf->cpu] = NULL; kfree(buf->padding); @@ -736,7 +765,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) kref_get(&buf->kref); filp->private_data = buf; - return 0; + return nonseekable_open(inode, filp); } /** @@ -1056,6 +1085,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) +{ +} + /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ @@ -1083,6 +1116,7 @@ static int subbuf_splice_actor(struct file *in, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, + .spd_release = relay_page_release, }; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) @@ -1157,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in, ret = 0; spliced = 0; - while (len) { + while (len && !spliced) { ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); if (ret < 0) break; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index efbfc0fc232..d3c61b4ebef 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -10,6 +10,7 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> +#include <linux/slab.h> #include <linux/res_counter.h> #include <linux/uaccess.h> @@ -27,6 +28,8 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val) } counter->usage += val; + if (counter->usage > counter->max_usage) + counter->max_usage = counter->usage; return 0; } @@ -65,6 +68,8 @@ res_counter_member(struct res_counter *counter, int member) switch (member) { case RES_USAGE: return &counter->usage; + case RES_MAX_USAGE: + return &counter->max_usage; case RES_LIMIT: return &counter->limit; case RES_FAILCNT: @@ -92,6 +97,11 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + return *res_counter_member(counter, member); +} + ssize_t res_counter_write(struct res_counter *counter, int member, const char __user *userbuf, size_t nbytes, loff_t *pos, int (*write_strategy)(char *st_buf, unsigned long long *val)) diff --git a/kernel/resource.c b/kernel/resource.c index 82aea814d40..74af2d7cb5a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -131,14 +131,8 @@ static const struct file_operations proc_iomem_operations = { static int __init ioresources_init(void) { - struct proc_dir_entry *entry; - - entry = create_proc_entry("ioports", 0, NULL); - if (entry) - entry->proc_fops = &proc_ioports_operations; - entry = create_proc_entry("iomem", 0, NULL); - if (entry) - entry->proc_fops = &proc_iomem_operations; + proc_create("ioports", 0, NULL, &proc_ioports_operations); + proc_create("iomem", 0, NULL, &proc_iomem_operations); return 0; } __initcall(ioresources_init); @@ -486,6 +480,24 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t EXPORT_SYMBOL(adjust_resource); +/** + * resource_alignment - calculate resource's alignment + * @res: resource pointer + * + * Returns alignment on success, 0 (invalid alignment) on failure. + */ +resource_size_t resource_alignment(struct resource *res) +{ + switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { + case IORESOURCE_SIZEALIGN: + return res->end - res->start + 1; + case IORESOURCE_STARTALIGN: + return res->start; + default: + return 0; + } +} + /* * This is compatibility stuff for IO resources. * diff --git a/kernel/sched.c b/kernel/sched.c index 28c73f07efb..94ead43eda6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -66,21 +66,15 @@ #include <linux/unistd.h> #include <linux/pagemap.h> #include <linux/hrtimer.h> +#include <linux/tick.h> +#include <linux/bootmem.h> +#include <linux/debugfs.h> +#include <linux/ctype.h> #include <asm/tlb.h> #include <asm/irq_regs.h> /* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - -/* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. @@ -114,6 +108,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -137,7 +136,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) return 1; return 0; } @@ -155,6 +154,90 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + /* nests inside the rq lock: */ + spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +static DEFINE_MUTEX(sched_domains_mutex); + #ifdef CONFIG_GROUP_SCHED #include <linux/cgroup.h> @@ -181,29 +264,39 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; }; +#ifdef CONFIG_USER_SCHED + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. + */ +struct task_group root_task_group; + #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; - -static struct sched_entity *init_sched_entity_p[NR_CPUS]; -static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; #endif #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; - -static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; -static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif +#else +#define root_task_group init_task_group #endif /* task_group_lock serializes add/remove of task groups and also changes to @@ -211,9 +304,6 @@ static struct rt_rq *init_rt_rq_p[NR_CPUS]; */ static DEFINE_SPINLOCK(task_group_lock); -/* doms_cur_mutex serializes access to doms_cur[] array */ -static DEFINE_MUTEX(doms_cur_mutex); - #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -221,23 +311,24 @@ static DEFINE_MUTEX(doms_cur_mutex); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES 2 +#define MAX_SHARES (1UL << 18) + static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif /* Default task group. * Every task in system belong to this group at bootup. */ -struct task_group init_task_group = { -#ifdef CONFIG_FAIR_GROUP_SCHED - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - .rt_se = init_sched_rt_entity_p, - .rt_rq = init_rt_rq_p, -#endif -}; +struct task_group init_task_group; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) @@ -269,21 +360,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -static inline void lock_doms_cur(void) -{ - mutex_lock(&doms_cur_mutex); -} - -static inline void unlock_doms_cur(void) -{ - mutex_unlock(&doms_cur_mutex); -} - #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_doms_cur(void) { } -static inline void unlock_doms_cur(void) { } #endif /* CONFIG_GROUP_SCHED */ @@ -297,8 +376,12 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; - /* 'curr' points to currently running entity on this cfs_rq. + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr, *next; @@ -334,6 +417,9 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -396,6 +482,7 @@ struct rq { unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned char idle_at_tick; #ifdef CONFIG_NO_HZ + unsigned long last_tick_seen; unsigned char in_nohz_recently; #endif /* capture load from *all* tasks on this cpu: */ @@ -405,8 +492,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -428,13 +513,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows, clock_underflows; - u64 idle_clock; - unsigned int clock_deep_idle_events; - u64 tick_timestamp; + u64 clock; atomic_t nr_iowait; @@ -500,53 +579,6 @@ static inline int cpu_of(struct rq *rq) } /* - * Update the per-runqueue clock, as finegrained as the platform can give - * us, but without assuming monotonicity, etc.: - */ -static void __update_rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -#endif - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { - if (clock < rq->tick_timestamp + TICK_NSEC) - clock = rq->tick_timestamp + TICK_NSEC; - else - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; -} - -static void update_rq_clock(struct rq *rq) -{ - if (likely(smp_processor_id() == cpu_of(rq))) - __update_rq_clock(rq); -} - -/* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. * @@ -561,21 +593,9 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) +static inline void update_rq_clock(struct rq *rq) { - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; + rq->clock = sched_clock_cpu(cpu_of(rq)); } /* @@ -590,22 +610,137 @@ unsigned long rt_needs_cpu(int cpu) /* * Debugging: various feature bits */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + enum { - SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, - SCHED_FEAT_WAKEUP_PREEMPT = 2, - SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_HRTICK = 8, - SCHED_FEAT_DOUBLE_TICK = 16, +#include "sched_features.h" }; +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0; +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static __read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +static ssize_t +sched_feat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + for (i = 0; sched_feat_names[i]; i++) { + len += strlen(sched_feat_names[i]); + len += 4; + } + + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; sched_feat_names[i]; i++) { + if (sysctl_sched_features & (1UL << i)) + r += sprintf(buf + r, "%s ", sched_feat_names[i]); + else + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + return cnt; +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .read = sched_feat_read, + .write = sched_feat_write, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) /* * Number of tasks to iterate in a single balance run. @@ -627,20 +762,58 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_period < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +unsigned long long time_sync_thresh = 100000; + +static DEFINE_PER_CPU(unsigned long long, time_offset); +static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); /* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): + * Global lock which we take every now and then to synchronize + * the CPUs time. This method is not warp-safe, but it's good + * enough to synchronize slowly diverging time sources and thus + * it's good enough for tracing: */ -unsigned long long cpu_clock(int cpu) +static DEFINE_SPINLOCK(time_sync_lock); +static unsigned long long prev_global_time; + +static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) +{ + /* + * We want this inlined, to not get tracer function calls + * in this critical section: + */ + spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); + __raw_spin_lock(&time_sync_lock.raw_lock); + + if (time < prev_global_time) { + per_cpu(time_offset, cpu) += prev_global_time - time; + time = prev_global_time; + } else { + prev_global_time = time; + } + + __raw_spin_unlock(&time_sync_lock.raw_lock); + spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); + + return time; +} + +static unsigned long long __cpu_clock(int cpu) { unsigned long long now; - unsigned long flags; - struct rq *rq; /* * Only call sched_clock() if the scheduler has already been @@ -649,13 +822,32 @@ unsigned long long cpu_clock(int cpu) if (unlikely(!scheduler_running)) return 0; + now = sched_clock_cpu(cpu); + + return now; +} + +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + unsigned long long prev_cpu_time, time, delta_time; + unsigned long flags; + local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); - now = rq->clock; + prev_cpu_time = per_cpu(prev_cpu_time, cpu); + time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); + delta_time = time-prev_cpu_time; + + if (unlikely(delta_time > time_sync_thresh)) { + time = __sync_cpu_clock(time, cpu); + per_cpu(prev_cpu_time, cpu) = time; + } local_irq_restore(flags); - return now; + return time; } EXPORT_SYMBOL_GPL(cpu_clock); @@ -804,43 +996,6 @@ static struct rq *this_rq_lock(void) return rq; } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - - spin_lock(&rq->lock); - __update_rq_clock(rq); - spin_unlock(&rq->lock); - rq->clock_deep_idle_events++; -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - u64 now = sched_clock(); - - rq->idle_clock += delta_ns; - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - spin_lock(&rq->lock); - rq->prev_clock_raw = now; - rq->clock += delta_ns; - spin_unlock(&rq->lock); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - static void __resched_task(struct task_struct *p, int tif_bit); static inline void resched_task(struct task_struct *p) @@ -876,6 +1031,7 @@ static inline void resched_rq(struct rq *rq) enum { HRTICK_SET, /* re-programm hrtick_timer */ HRTICK_RESET, /* not a new slice */ + HRTICK_BLOCK, /* stop hrtick operations */ }; /* @@ -887,6 +1043,8 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; + if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } @@ -962,14 +1120,72 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); spin_lock(&rq->lock); - __update_rq_clock(rq); + update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); spin_unlock(&rq->lock); return HRTIMER_NORESTART; } -static inline void init_rq_hrtick(struct rq *rq) +#ifdef CONFIG_SMP +static void hotplug_hrtick_disable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + rq->hrtick_flags = 0; + __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); + + hrtick_clear(rq); +} + +static void hotplug_hrtick_enable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hotplug_hrtick_disable(cpu); + return NOTIFY_OK; + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hotplug_hrtick_enable(cpu); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) { rq->hrtick_flags = 0; hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -1006,6 +1222,10 @@ static inline void init_rq_hrtick(struct rq *rq) void hrtick_resched(void) { } + +static inline void init_hrtick(void) +{ +} #endif /* @@ -1052,6 +1272,49 @@ static void resched_cpu(int cpu) resched_task(cpu_curr(cpu)); spin_unlock_irqrestore(&rq->lock, flags); } + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif + #else static void __resched_task(struct task_struct *p, int tif_bit) { @@ -1079,8 +1342,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } tmp = (u64)delta_exec * weight; /* @@ -1198,11 +1466,29 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +#else /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +} +#endif + #endif /* CONFIG_SMP */ #include "sched_stats.h" @@ -1395,7 +1681,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) /* * Buddy candidates are cache hot: */ - if (&p->se == cfs_rq_of(&p->se)->next) + if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next)) return 1; if (p->sched_class != &fair_sched_class) @@ -1685,17 +1971,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + cpumask_t *tmp) { - cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(tmp, group->cpumask, p->cpus_allowed); + cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1734,7 +2020,7 @@ static int sched_balance_self(int cpu, int flag) } while (sd) { - cpumask_t span; + cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -1750,7 +2036,7 @@ static int sched_balance_self(int cpu, int flag) continue; } - new_cpu = find_idlest_cpu(group, t, cpu); + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -1796,6 +2082,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + if (!sched_feat(SYNC_WAKEUPS)) + sync = 0; + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; @@ -1912,6 +2201,7 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2631,7 +2921,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + int *sd_idle, const cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2932,7 +3222,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, cpumask_t *cpus) + unsigned long imbalance, const cpumask_t *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -2971,15 +3261,16 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *balance, cpumask_t *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; - cpumask_t cpus = CPU_MASK_ALL; unsigned long flags; + cpus_setall(*cpus); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -2994,7 +3285,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus, balance); + cpus, balance); if (*balance == 0) goto out_balanced; @@ -3004,7 +3295,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, &cpus); + busiest = find_busiest_queue(group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -3037,8 +3328,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; goto out_balanced; } @@ -3123,7 +3414,8 @@ out_one_pinned: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + cpumask_t *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3131,7 +3423,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; - cpumask_t cpus = CPU_MASK_ALL; + + cpus_setall(*cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3146,14 +3439,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, &cpus, NULL); + &sd_idle, cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, - &cpus); + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; @@ -3175,8 +3467,8 @@ redo: spin_unlock(&busiest->lock); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; } } @@ -3210,6 +3502,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; + cpumask_t tmpmask; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3219,8 +3512,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, - this_rq, sd); + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, &tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3379,6 +3672,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t tmp; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3402,7 +3696,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -3518,7 +3812,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) */ int ilb = first_cpu(nohz.cpu_mask); - if (ilb != NR_CPUS) + if (ilb < nr_cpu_ids) resched_cpu(ilb); } } @@ -3645,8 +3939,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - return account_guest_time(p, cputime); + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime); + return; + } p->stime = cputime_add(p->stime, cputime); @@ -3710,21 +4006,13 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; - u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + sched_clock_tick(); spin_lock(&rq->lock); - __update_rq_clock(rq); - /* - * Let rq->clock advance by at least TICK_NSEC: - */ - if (unlikely(rq->clock < next_tick)) { - rq->clock = next_tick; - rq->clock_underflows++; - } - rq->tick_timestamp = rq->clock; + update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3801,7 +4089,7 @@ static inline void schedule_debug(struct task_struct *prev) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) __schedule_bug(prev); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -3876,17 +4164,15 @@ need_resched_nonpreemptible: * Do the rq-clock update outside the rq lock: */ local_irq_disable(); - __update_rq_clock(rq); + update_rq_clock(rq); spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - signal_pending(prev))) { + if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - } else { + else deactivate_task(rq, prev, 1); - } switch_count = &prev->nvcsw; } @@ -3901,9 +4187,9 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); - sched_info_switch(prev, next); - if (likely(prev != next)) { + sched_info_switch(prev, next); + rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -3938,8 +4224,6 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); - struct task_struct *task = current; - int saved_lock_depth; /* * If there is a non-zero preempt_count or interrupts are disabled, @@ -3950,16 +4234,7 @@ asmlinkage void __sched preempt_schedule(void) do { add_preempt_count(PREEMPT_ACTIVE); - - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; schedule(); - task->lock_depth = saved_lock_depth; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -3980,26 +4255,15 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); - struct task_struct *task = current; - int saved_lock_depth; /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); do { add_preempt_count(PREEMPT_ACTIVE); - - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; local_irq_enable(); schedule(); local_irq_disable(); - task->lock_depth = saved_lock_depth; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -4134,22 +4398,20 @@ do_wait_for_common(struct completion *x, long timeout, int state) signal_pending(current)) || (state == TASK_KILLABLE && fatal_signal_pending(current))) { - __remove_wait_queue(&x->wait, &wait); - return -ERESTARTSYS; + timeout = -ERESTARTSYS; + break; } __set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = schedule_timeout(timeout); spin_lock_irq(&x->wait.lock); - if (!timeout) { - __remove_wait_queue(&x->wait, &wait); - return timeout; - } - } while (!x->done); + } while (!x->done && timeout); __remove_wait_queue(&x->wait, &wait); + if (!x->done) + return timeout; } x->done--; - return timeout; + return timeout ?: 1; } static long __sched @@ -4559,7 +4821,7 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -4721,9 +4983,10 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, cpumask_t new_mask) +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { cpumask_t cpus_allowed; + cpumask_t new_mask = *in_mask; struct task_struct *p; int retval; @@ -4754,13 +5017,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) if (retval) goto out_unlock; - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: - retval = set_cpus_allowed(p, new_mask); + retval = set_cpus_allowed_ptr(p, &new_mask); if (!retval) { - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); if (!cpus_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset @@ -4804,7 +5067,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } /* @@ -4920,7 +5183,6 @@ static void __cond_resched(void) } while (need_resched()); } -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && @@ -4931,7 +5193,6 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); -#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -5226,8 +5487,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); +#else task_thread_info(idle)->preempt_count = 0; - +#endif /* * The idle tasks have their own, simple scheduling class: */ @@ -5266,7 +5530,6 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; - sysctl_sched_batch_wakeup_granularity *= factor; } #ifdef CONFIG_SMP @@ -5295,7 +5558,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) { struct migration_req req; unsigned long flags; @@ -5303,23 +5566,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + if (!cpus_intersects(*new_mask, cpu_online_map)) { ret = -EINVAL; goto out; } if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, &new_mask); + p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = new_mask; - p->rt.nr_cpus_allowed = cpus_weight(new_mask); + p->cpus_allowed = *new_mask; + p->rt.nr_cpus_allowed = cpus_weight(*new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), new_mask)) + if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(new_mask), &req)) { + if (migrate_task(p, any_online_cpu(*new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -5332,7 +5595,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Move (not current) task off this cpu, onto dest cpu. We're doing @@ -5470,12 +5733,14 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) dest_cpu = any_online_cpu(mask); /* On any allowed CPU? */ - if (dest_cpu == NR_CPUS) + if (dest_cpu >= nr_cpu_ids) dest_cpu = any_online_cpu(p->cpus_allowed); /* No more Mr. Nice Guy. */ - if (dest_cpu == NR_CPUS) { - cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + if (dest_cpu >= nr_cpu_ids) { + cpumask_t cpus_allowed; + + cpuset_cpus_allowed_locked(p, &cpus_allowed); /* * Try to stay on the same cpuset, where the * current cpuset may be a subset of all cpus. @@ -5511,7 +5776,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); unsigned long flags; local_irq_save(flags); @@ -5622,6 +5887,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu) next = pick_next_task(rq, rq->curr); if (!next) break; + next->sched_class->put_prev_task(rq, next); migrate_dead(dead_cpu, next); } @@ -5923,20 +6189,16 @@ void __init migration_init(void) #ifdef CONFIG_SMP -/* Number of possible processor ids */ -int nr_cpu_ids __read_mostly = NR_CPUS; -EXPORT_SYMBOL(nr_cpu_ids); - #ifdef CONFIG_SCHED_DEBUG -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + cpumask_t *groupmask) { struct sched_group *group = sd->groups; - cpumask_t groupmask; - char str[NR_CPUS]; + char str[256]; - cpumask_scnprintf(str, NR_CPUS, sd->span); - cpus_clear(groupmask); + cpulist_scnprintf(str, sizeof(str), sd->span); + cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -5980,25 +6242,25 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) break; } - if (cpus_intersects(groupmask, group->cpumask)) { + if (cpus_intersects(*groupmask, group->cpumask)) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(groupmask, groupmask, group->cpumask); + cpus_or(*groupmask, *groupmask, group->cpumask); - cpumask_scnprintf(str, NR_CPUS, group->cpumask); + cpulist_scnprintf(str, sizeof(str), group->cpumask); printk(KERN_CONT " %s", str); group = group->next; } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, groupmask)) + if (!cpus_equal(sd->span, *groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6006,6 +6268,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) static void sched_domain_debug(struct sched_domain *sd, int cpu) { + cpumask_t *groupmask; int level = 0; if (!sd) { @@ -6015,14 +6278,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!groupmask) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + for (;;) { - if (sched_domain_debug_one(sd, cpu, level)) + if (sched_domain_debug_one(sd, cpu, level, groupmask)) break; level++; sd = sd->parent; if (!sd) break; } + kfree(groupmask); } #else # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6210,30 +6480,33 @@ __setup("isolcpus=", isolated_cpu_setup); * and ->cpu_power to 0. */ static void -init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int (*group_fn)(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg)) + struct sched_group **sg, + cpumask_t *tmpmask), + cpumask_t *covered, cpumask_t *tmpmask) { struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; int i; - for_each_cpu_mask(i, span) { + cpus_clear(*covered); + + for_each_cpu_mask(i, *span) { struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg); + int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, covered)) + if (cpu_isset(i, *covered)) continue; - sg->cpumask = CPU_MASK_NONE; + cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map, NULL) != group) + for_each_cpu_mask(j, *span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, covered); + cpu_set(j, *covered); cpu_set(j, sg->cpumask); } if (!first) @@ -6259,7 +6532,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, * * Should use nodemask_t. */ -static int find_next_best_node(int node, unsigned long *used_nodes) +static int find_next_best_node(int node, nodemask_t *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -6273,7 +6546,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) continue; /* Skip already used nodes */ - if (test_bit(n, used_nodes)) + if (node_isset(n, *used_nodes)) continue; /* Simple min distance search */ @@ -6285,40 +6558,37 @@ static int find_next_best_node(int node, unsigned long *used_nodes) } } - set_bit(best_node, used_nodes); + node_set(best_node, *used_nodes); return best_node; } /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing - * @size: number of nodes to include in this span + * @span: resulting cpumask * * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ -static cpumask_t sched_domain_node_span(int node) +static void sched_domain_node_span(int node, cpumask_t *span) { - DECLARE_BITMAP(used_nodes, MAX_NUMNODES); - cpumask_t span, nodemask; + nodemask_t used_nodes; + node_to_cpumask_ptr(nodemask, node); int i; - cpus_clear(span); - bitmap_zero(used_nodes, MAX_NUMNODES); + cpus_clear(*span); + nodes_clear(used_nodes); - nodemask = node_to_cpumask(node); - cpus_or(span, span, nodemask); - set_bit(node, used_nodes); + cpus_or(*span, *span, *nodemask); + node_set(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); + int next_node = find_next_best_node(node, &used_nodes); - nodemask = node_to_cpumask(next_node); - cpus_or(span, span, nodemask); + node_to_cpumask_ptr_next(nodemask, next_node); + cpus_or(*span, *span, *nodemask); } - - return span; } #endif @@ -6332,7 +6602,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu); @@ -6350,19 +6621,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); if (sg) *sg = &per_cpu(sched_group_core, group); return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu); @@ -6374,17 +6648,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; #ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = cpu_coregroup_map(cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #else group = cpu; #endif @@ -6400,19 +6675,19 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) * gets dynamically allocated. */ static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; +static struct sched_group ***sched_group_nodes_bycpu; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) + struct sched_group **sg, cpumask_t *nodemask) { - cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); int group; - cpus_and(nodemask, nodemask, *cpu_map); - group = first_cpu(nodemask); + *nodemask = node_to_cpumask(cpu_to_node(cpu)); + cpus_and(*nodemask, *nodemask, *cpu_map); + group = first_cpu(*nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group); @@ -6448,7 +6723,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; @@ -6460,11 +6735,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) continue; for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; if (sg == NULL) @@ -6482,7 +6757,7 @@ next_sg: } } #else -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } #endif @@ -6540,13 +6815,111 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) } /* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#define SD_INIT(sd, type) sd_init_##type(sd) +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +/* + * To minimize stack usage kmalloc room for cpumasks and share the + * space as the usage in build_sched_domains() dictates. Used only + * if the amount of space is significant. + */ +struct allmasks { + cpumask_t tmpmask; /* make this one first */ + union { + cpumask_t nodemask; + cpumask_t this_sibling_map; + cpumask_t this_core_map; + }; + cpumask_t send_covered; + +#ifdef CONFIG_NUMA + cpumask_t domainspan; + cpumask_t covered; + cpumask_t notcovered; +#endif +}; + +#if NR_CPUS > 128 +#define SCHED_CPUMASK_ALLOC 1 +#define SCHED_CPUMASK_FREE(v) kfree(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#else +#define SCHED_CPUMASK_ALLOC 0 +#define SCHED_CPUMASK_FREE(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#endif + +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ + ((unsigned long)(a) + offsetof(struct allmasks, v)) + +static int default_relax_domain_level = -1; + +static int __init setup_relax_domain_level(char *str) +{ + unsigned long val; + + val = simple_strtoul(str, NULL, 0); + if (val < SD_LV_MAX) + default_relax_domain_level = val; + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + } +} + +/* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int build_sched_domains(const cpumask_t *cpu_map) +static int __build_sched_domains(const cpumask_t *cpu_map, + struct sched_domain_attr *attr) { int i; struct root_domain *rd; + SCHED_CPUMASK_DECLARE(allmasks); + cpumask_t *tmpmask; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6560,39 +6933,63 @@ static int build_sched_domains(const cpumask_t *cpu_map) printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; } - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + return -ENOMEM; + } + +#if SCHED_CPUMASK_ALLOC + /* get space for all scratch cpumask variables */ + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + if (!allmasks) { + printk(KERN_WARNING "Cannot alloc cpumask array\n"); + kfree(rd); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif return -ENOMEM; } +#endif + tmpmask = (cpumask_t *)allmasks; + + +#ifdef CONFIG_NUMA + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif /* * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + SCHED_CPUMASK_VAR(nodemask, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); + *nodemask = node_to_cpumask(cpu_to_node(i)); + cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); - *sd = SD_ALLNODES_INIT; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); sd->span = *cpu_map; - cpu_to_allnodes_group(i, cpu_map, &sd->groups); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; } else p = NULL; sd = &per_cpu(node_domains, i); - *sd = SD_NODE_INIT; - sd->span = sched_domain_node_span(cpu_to_node(i)); + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), &sd->span); sd->parent = p; if (p) p->child = sd; @@ -6601,94 +6998,117 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - *sd = SD_CPU_INIT; - sd->span = nodemask; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + sd->span = *nodemask; sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups); + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - *sd = SD_MC_INIT; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups); + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - *sd = SD_SIBLING_INIT; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups); + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); - if (i != first_cpu(this_sibling_map)) + SCHED_CPUMASK_VAR(this_sibling_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_sibling_map = per_cpu(cpu_sibling_map, i); + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); + if (i != first_cpu(*this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + &cpu_to_cpu_group, + send_covered, tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_core_map = cpu_coregroup_map(i); - cpus_and(this_core_map, this_core_map, *cpu_map); - if (i != first_cpu(this_core_map)) + SCHED_CPUMASK_VAR(this_core_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_core_map = cpu_coregroup_map(i); + cpus_and(*this_core_map, *this_core_map, *cpu_map); + if (i != first_cpu(*this_core_map)) continue; + init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + &cpu_to_core_group, + send_covered, tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) { + SCHED_CPUMASK_VAR(send_covered, allmasks); + + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - cpumask_t nodemask = node_to_cpumask(i); - cpumask_t domainspan; - cpumask_t covered = CPU_MASK_NONE; + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(domainspan, allmasks); + SCHED_CPUMASK_VAR(covered, allmasks); int j; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) { + *nodemask = node_to_cpumask(i); + cpus_clear(*covered); + + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) { sched_group_nodes[i] = NULL; continue; } - domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); + sched_domain_node_span(i, domainspan); + cpus_and(*domainspan, *domainspan, *cpu_map); sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); if (!sg) { @@ -6697,31 +7117,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, nodemask) { + for_each_cpu_mask(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = nodemask; + sg->cpumask = *nodemask; sg->next = sg; - cpus_or(covered, covered, nodemask); + cpus_or(*covered, *covered, *nodemask); prev = sg; for (j = 0; j < MAX_NUMNODES; j++) { - cpumask_t tmp, notcovered; + SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % MAX_NUMNODES; + node_to_cpumask_ptr(pnodemask, n); - cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); - cpus_and(tmp, tmp, domainspan); - if (cpus_empty(tmp)) + cpus_complement(*notcovered, *covered); + cpus_and(*tmpmask, *notcovered, *cpu_map); + cpus_and(*tmpmask, *tmpmask, *domainspan); + if (cpus_empty(*tmpmask)) break; - nodemask = node_to_cpumask(n); - cpus_and(tmp, tmp, nodemask); - if (cpus_empty(tmp)) + cpus_and(*tmpmask, *tmpmask, *pnodemask); + if (cpus_empty(*tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group), @@ -6732,9 +7152,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sg->__cpu_power = 0; - sg->cpumask = tmp; + sg->cpumask = *tmpmask; sg->next = prev->next; - cpus_or(covered, covered, tmp); + cpus_or(*covered, *covered, *tmpmask); prev->next = sg; prev = sg; } @@ -6770,7 +7190,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + tmpmask); init_numa_sched_groups_power(sg); } #endif @@ -6788,17 +7209,26 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpu_attach_domain(sd, rd, i); } + SCHED_CPUMASK_FREE((void *)allmasks); return 0; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); + SCHED_CPUMASK_FREE((void *)allmasks); return -ENOMEM; #endif } +static int build_sched_domains(const cpumask_t *cpu_map) +{ + return __build_sched_domains(cpu_map, NULL); +} + static cpumask_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of @@ -6812,6 +7242,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void) } /* + * Free current domain masks. + * Called after all cpus are attached to NULL domain. + */ +static void free_sched_domains(void) +{ + ndoms_cur = 0; + if (doms_cur != &fallback_doms) + kfree(doms_cur); + doms_cur = &fallback_doms; +} + +/* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. @@ -6826,15 +7268,17 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map, + cpumask_t *tmpmask) { - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); } /* @@ -6843,6 +7287,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) */ static void detach_destroy_domains(const cpumask_t *cpu_map) { + cpumask_t tmpmask; int i; unregister_sched_domain_sysctl(); @@ -6850,7 +7295,23 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map); + arch_destroy_sched_domains(cpu_map, &tmpmask); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); } /* @@ -6874,11 +7335,12 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new) { int i, j; - lock_doms_cur(); + mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); @@ -6887,12 +7349,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; } /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < ndoms_new; j++) { - if (cpus_equal(doms_cur[i], doms_new[j])) + if (cpus_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ @@ -6904,11 +7368,13 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j])) + if (cpus_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - build_sched_domains(doms_new + i); + __build_sched_domains(doms_new + i, + dattr_new ? dattr_new + i : NULL); match2: ; } @@ -6916,12 +7382,14 @@ match2: /* Remember the new sched domains */ if (doms_cur != &fallback_doms) kfree(doms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; + dattr_cur = dattr_new; ndoms_cur = ndoms_new; register_sched_domain_sysctl(); - unlock_doms_cur(); + mutex_unlock(&sched_domains_mutex); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -6930,8 +7398,11 @@ int arch_reinit_sched_domains(void) int err; get_online_cpus(); + mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); + free_sched_domains(); err = arch_init_sched_domains(&cpu_online_map); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); return err; @@ -7015,6 +7486,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); + free_sched_domains(); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -7033,8 +7505,16 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_DONE; } +#ifndef CONFIG_CPUSETS + /* + * Create default domain partitioning if cpusets are disabled. + * Otherwise we let cpusets rebuild the domains based on the + * current setup. + */ + /* The hotplug lock is already held by cpu_up/cpu_down */ arch_init_sched_domains(&cpu_online_map); +#endif return NOTIFY_OK; } @@ -7043,17 +7523,25 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; +#if defined(CONFIG_NUMA) + sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), + GFP_KERNEL); + BUG_ON(sched_group_nodes_bycpu == NULL); +#endif get_online_cpus(); + mutex_lock(&sched_domains_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) BUG(); sched_init_granularity(); } @@ -7074,6 +7562,7 @@ int in_sched_functions(unsigned long addr) static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif @@ -7103,6 +7592,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -7111,10 +7602,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, - struct cfs_rq *cfs_rq, struct sched_entity *se, - int cpu, int add) +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) { + struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; @@ -7122,45 +7614,130 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; - se->cfs_rq = &rq->cfs; + /* se could be NULL for init_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + se->my_q = cfs_rq; se->load.weight = tg->shares; - se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); - se->parent = NULL; + se->load.inv_weight = 0; + se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, - struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, - int cpu, int add) +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) { + struct rq *rq = cpu_rq(cpu); + tg->rt_rq[cpu] = rt_rq; init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; if (add) list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; - rt_se->rt_rq = &rq->rt; + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + rt_se->my_q = rt_rq; - rt_se->parent = NULL; + rt_se->parent = parent; INIT_LIST_HEAD(&rt_se->run_list); } #endif void __init sched_init(void) { - int highest_cpu = 0; int i, j; + unsigned long alloc_size = 0, ptr; + +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; +#endif + /* + * As sched_init() is called before page_alloc is setup, + * we use alloc_bootmem(). + */ + if (alloc_size) { + ptr = (unsigned long)alloc_bootmem(alloc_size); + +#ifdef CONFIG_FAIR_GROUP_SCHED + init_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif +#endif + } #ifdef CONFIG_SMP init_defrootdomain(); #endif + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); + INIT_LIST_HEAD(&init_task_group.children); + +#ifdef CONFIG_USER_SCHED + INIT_LIST_HEAD(&root_task_group.children); + init_task_group.parent = &root_task_group; + list_add(&init_task_group.siblings, &root_task_group.children); +#endif #endif for_each_possible_cpu(i) { @@ -7170,27 +7747,67 @@ void __init sched_init(void) spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - init_tg_cfs_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). + */ + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ + init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1); + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); #endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); - init_tg_rt_entry(rq, &init_task_group, +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); +#elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); + init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1); + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); +#endif #endif - rq->rt_period_expire = 0; - rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7207,7 +7824,6 @@ void __init sched_init(void) #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - highest_cpu = i; } set_load_weight(&init_task); @@ -7217,7 +7833,6 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); #endif @@ -7275,6 +7890,7 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) @@ -7306,7 +7922,6 @@ void normalize_rt_tasks(void) p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->clock = 0; if (!rt_task(p)) { /* @@ -7376,8 +7991,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_GROUP_SCHED - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7394,17 +8007,18 @@ static void free_fair_sched_group(struct task_group *tg) kfree(tg->se); } -static int alloc_fair_sched_group(struct task_group *tg) +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se; + struct sched_entity *se, *parent_se; struct rq *rq; int i; - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->cfs_rq) goto err; - tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); if (!tg->se) goto err; @@ -7423,7 +8037,8 @@ static int alloc_fair_sched_group(struct task_group *tg) if (!se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + parent_se = parent ? parent->se[i] : NULL; + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); } return 1; @@ -7447,7 +8062,8 @@ static inline void free_fair_sched_group(struct task_group *tg) { } -static inline int alloc_fair_sched_group(struct task_group *tg) +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7466,6 +8082,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; + destroy_rt_bandwidth(&tg->rt_bandwidth); + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -7477,21 +8095,23 @@ static void free_rt_sched_group(struct task_group *tg) kfree(tg->rt_se); } -static int alloc_rt_sched_group(struct task_group *tg) +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; + struct sched_rt_entity *rt_se, *parent_se; struct rq *rq; int i; - tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_rq) goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); if (!tg->rt_se) goto err; - tg->rt_runtime = 0; + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7506,7 +8126,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!rt_se) goto err; - init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); + parent_se = parent ? parent->rt_se[i] : NULL; + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); } return 1; @@ -7530,7 +8151,8 @@ static inline void free_rt_sched_group(struct task_group *tg) { } -static inline int alloc_rt_sched_group(struct task_group *tg) +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7544,6 +8166,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif +#ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -7552,7 +8175,7 @@ static void free_sched_group(struct task_group *tg) } /* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; @@ -7562,10 +8185,10 @@ struct task_group *sched_create_group(void) if (!tg) return ERR_PTR(-ENOMEM); - if (!alloc_fair_sched_group(tg)) + if (!alloc_fair_sched_group(tg, parent)) goto err; - if (!alloc_rt_sched_group(tg)) + if (!alloc_rt_sched_group(tg, parent)) goto err; spin_lock_irqsave(&task_group_lock, flags); @@ -7574,6 +8197,12 @@ struct task_group *sched_create_group(void) register_rt_sched_group(tg, i); } list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + list_add_rcu(&tg->siblings, &parent->children); + INIT_LIST_HEAD(&tg->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -7602,6 +8231,7 @@ void sched_destroy_group(struct task_group *tg) unregister_rt_sched_group(tg, i); } list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); /* wait for possible concurrent references to cfs_rqs complete */ @@ -7645,6 +8275,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -7660,7 +8291,7 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) dequeue_entity(cfs_rq, se, 0); se->load.weight = shares; - se->load.inv_weight = div64_64((1ULL<<32), shares); + se->load.inv_weight = 0; if (on_rq) enqueue_entity(cfs_rq, se, 0); @@ -7676,12 +8307,15 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) unsigned long flags; /* - * A weight of 0 or 1 can cause arithmetics problems. - * (The default weight is 1024 - so there's no practical - * limitation from this.) + * We can't change the weight of the root cgroup. */ - if (shares < 2) - shares = 2; + if (!tg->se[0]) + return -EINVAL; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; mutex_lock(&shares_mutex); if (tg->shares == shares) @@ -7690,6 +8324,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); + list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ @@ -7710,6 +8345,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) register_fair_sched_group(tg, i); + list_add_rcu(&tg->siblings, &tg->parent->children); spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); @@ -7733,29 +8369,61 @@ static unsigned long to_ratio(u64 period, u64 runtime) if (runtime == RUNTIME_INF) return 1ULL << 16; - return div64_64(runtime << 16, period); + return div64_u64(runtime << 16, period); } +#ifdef CONFIG_CGROUP_SCHED +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct task_group *tgi, *parent = tg ? tg->parent : NULL; + unsigned long total = 0; + + if (!parent) { + if (global_rt_period() < period) + return 0; + + return to_ratio(period, runtime) < + to_ratio(global_rt_period(), global_rt_runtime()); + } + + if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &parent->children, siblings) { + if (tgi == tg) + continue; + + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); + } + rcu_read_unlock(); + + return total + to_ratio(period, runtime) < + to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), + parent->rt_bandwidth.rt_runtime); +} +#elif defined CONFIG_USER_SCHED static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; unsigned long total = 0; unsigned long global_ratio = - to_ratio(sysctl_sched_rt_period, - sysctl_sched_rt_runtime < 0 ? - RUNTIME_INF : sysctl_sched_rt_runtime); + to_ratio(global_rt_period(), global_rt_runtime()); rcu_read_lock(); list_for_each_entry_rcu(tgi, &task_groups, list) { if (tgi == tg) continue; - total += to_ratio(period, tgi->rt_runtime); + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); } rcu_read_unlock(); return total + to_ratio(period, runtime) < global_ratio; } +#endif /* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) @@ -7768,19 +8436,14 @@ static inline int tg_has_rt_tasks(struct task_group *tg) return 0; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) { - u64 rt_runtime, rt_period; - int err = 0; - - rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us == -1) - rt_runtime = RUNTIME_INF; + int i, err = 0; mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { err = -EBUSY; goto unlock; } @@ -7788,7 +8451,19 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) err = -EINVAL; goto unlock; } - tg->rt_runtime = rt_runtime; + + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -7796,19 +8471,112 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return err; } +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; - if (tg->rt_runtime == RUNTIME_INF) + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) return -1; - rt_runtime_us = tg->rt_runtime; + rt_runtime_us = tg->rt_bandwidth.rt_runtime; do_div(rt_runtime_us, NSEC_PER_USEC); return rt_runtime_us; } + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + if (rt_period == 0) + return -EINVAL; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(NULL, 1, 0)) + ret = -EINVAL; + mutex_unlock(&rt_constraints_mutex); + + return ret; +} +#else +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} #endif -#endif /* CONFIG_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} #ifdef CONFIG_CGROUP_SCHED @@ -7822,7 +8590,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) static struct cgroup_subsys_state * cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct task_group *tg; + struct task_group *tg, *parent; if (!cgrp->parent) { /* This is early initialization for the top cgroup */ @@ -7830,11 +8598,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &init_task_group.css; } - /* we support only 1-level deep hierarchical scheduler atm */ - if (cgrp->parent->parent) - return ERR_PTR(-EINVAL); - - tg = sched_create_group(); + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7858,7 +8623,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, { #ifdef CONFIG_RT_GROUP_SCHED /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -7877,13 +8642,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { return sched_group_set_shares(cgroup_tg(cgrp), shareval); } -static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); @@ -7893,48 +8658,25 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) + s64 val) { - char buffer[64]; - int retval = 0; - s64 val; - char *end; - - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - - /* strip newline if necessary */ - if (nbytes && (buffer[nbytes-1] == '\n')) - buffer[nbytes-1] = 0; - val = simple_strtoll(buffer, &end, 0); - if (*end) - return -EINVAL; + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); +} - /* Pass to subsystem */ - retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); - if (!retval) - retval = nbytes; - return retval; +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_runtime(cgroup_tg(cgrp)); } -static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) { - char tmp[64]; - long val = sched_group_rt_runtime(cgroup_tg(cgrp)); - int len = sprintf(tmp, "%ld\n", val); + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); } #endif @@ -7942,15 +8684,20 @@ static struct cftype cpu_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", - .read_uint = cpu_shares_read_uint, - .write_uint = cpu_shares_write_uint, + .read_u64 = cpu_shares_read_u64, + .write_u64 = cpu_shares_write_u64, }, #endif #ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", - .read = cpu_rt_runtime_read, - .write = cpu_rt_runtime_write, + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, }, #endif }; @@ -7992,9 +8739,9 @@ struct cpuacct { struct cgroup_subsys cpuacct_subsys; /* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cont) +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), struct cpuacct, css); } @@ -8007,7 +8754,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cont) + struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -8025,18 +8772,18 @@ static struct cgroup_subsys_state *cpuacct_create( /* destroy an existing cpu accounting group */ static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); free_percpu(ca->cpuusage); kfree(ca); } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cont); + struct cpuacct *ca = cgroup_ca(cgrp); u64 totalcpuusage = 0; int i; @@ -8055,16 +8802,40 @@ static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) return totalcpuusage; } +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_possible_cpu(i) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + + spin_lock_irq(&cpu_rq(i)->lock); + *cpuusage = 0; + spin_unlock_irq(&cpu_rq(i)->lock); + } +out: + return err; +} + static struct cftype files[] = { { .name = "usage", - .read_uint = cpuusage_read, + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, }, }; -static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) { - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); } /* diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c new file mode 100644 index 00000000000..ce05271219a --- /dev/null +++ b/kernel/sched_clock.c @@ -0,0 +1,246 @@ +/* + * sched_clock for unstable cpu clocks + * + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Based on code by: + * Ingo Molnar <mingo@redhat.com> + * Guillaume Chazarain <guichaz@gmail.com> + * + * Create a semi stable clock from a mixture of other events, including: + * - gtod + * - jiffies + * - sched_clock() + * - explicit idle events + * + * We use gtod as base and the unstable clock deltas. The deltas are filtered, + * making it monotonic and keeping it within an expected window. This window + * is set up using jiffies. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat + * consistent between cpus (never more than 1 jiffies difference). + */ +#include <linux/sched.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/ktime.h> +#include <linux/module.h> + + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + +struct sched_clock_data { + /* + * Raw spinlock - this is a special case: this might be called + * from within instrumentation code so we dont want to do any + * instrumentation ourselves. + */ + raw_spinlock_t lock; + + unsigned long prev_jiffies; + u64 prev_raw; + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + +static __read_mostly int sched_clock_running; + +void sched_clock_init(void) +{ + u64 ktime_now = ktime_to_ns(ktime_get()); + unsigned long now_jiffies = jiffies; + int cpu; + + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + scd->prev_jiffies = now_jiffies; + scd->prev_raw = 0; + scd->tick_raw = 0; + scd->tick_gtod = ktime_now; + scd->clock = ktime_now; + } + + sched_clock_running = 1; +} + +/* + * update the percpu scd from the raw @now value + * + * - filter out backward motion + * - use jiffies to generate a min,max window to clip the raw values + */ +static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +{ + unsigned long now_jiffies = jiffies; + long delta_jiffies = now_jiffies - scd->prev_jiffies; + u64 clock = scd->clock; + u64 min_clock, max_clock; + s64 delta = now - scd->prev_raw; + + WARN_ON_ONCE(!irqs_disabled()); + min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + if (unlikely(delta < 0)) { + clock++; + goto out; + } + + max_clock = min_clock + TICK_NSEC; + + if (unlikely(clock + delta > max_clock)) { + if (clock < max_clock) + clock = max_clock; + else + clock++; + } else { + clock += delta; + } + + out: + if (unlikely(clock < min_clock)) + clock = min_clock; + + scd->prev_raw = now; + scd->prev_jiffies = now_jiffies; + scd->clock = clock; +} + +static void lock_double_clock(struct sched_clock_data *data1, + struct sched_clock_data *data2) +{ + if (data1 < data2) { + __raw_spin_lock(&data1->lock); + __raw_spin_lock(&data2->lock); + } else { + __raw_spin_lock(&data2->lock); + __raw_spin_lock(&data1->lock); + } +} + +u64 sched_clock_cpu(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + u64 now, clock; + + if (unlikely(!sched_clock_running)) + return 0ull; + + WARN_ON_ONCE(!irqs_disabled()); + now = sched_clock(); + + if (cpu != raw_smp_processor_id()) { + /* + * in order to update a remote cpu's clock based on our + * unstable raw time rebase it against: + * tick_raw (offset between raw counters) + * tick_gotd (tick offset between cpus) + */ + struct sched_clock_data *my_scd = this_scd(); + + lock_double_clock(scd, my_scd); + + now -= my_scd->tick_raw; + now += scd->tick_raw; + + now -= my_scd->tick_gtod; + now += scd->tick_gtod; + + __raw_spin_unlock(&my_scd->lock); + } else { + __raw_spin_lock(&scd->lock); + } + + __update_sched_clock(scd, now); + clock = scd->clock; + + __raw_spin_unlock(&scd->lock); + + return clock; +} + +void sched_clock_tick(void) +{ + struct sched_clock_data *scd = this_scd(); + u64 now, now_gtod; + + if (unlikely(!sched_clock_running)) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + now = sched_clock(); + now_gtod = ktime_to_ns(ktime_get()); + + __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now); + /* + * update tick_gtod after __update_sched_clock() because that will + * already observe 1 new jiffy; adding a new tick_gtod to that would + * increase the clock 2 jiffies. + */ + scd->tick_raw = now; + scd->tick_gtod = now_gtod; + __raw_spin_unlock(&scd->lock); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct sched_clock_data *scd = this_scd(); + u64 now = sched_clock(); + + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + __raw_spin_lock(&scd->lock); + scd->prev_raw = now; + scd->clock += delta_ns; + __raw_spin_unlock(&scd->lock); + + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +#endif + +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index ef358ba0768..8bb713040ac 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -67,14 +67,24 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld\n", + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", SPLIT_NS(p->se.vruntime), SPLIT_NS(p->se.sum_exec_runtime), SPLIT_NS(p->se.sum_sleep_runtime)); #else - SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld\n", + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif + +#ifdef CONFIG_CGROUP_SCHED + { + char path[64]; + + cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + SEQ_printf(m, " %s", path); + } +#endif + SEQ_printf(m, "\n"); } static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) @@ -109,7 +119,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; - SEQ_printf(m, "\ncfs_rq\n"); +#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) + SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#else + char path[128] = ""; + struct cgroup *cgroup = NULL; + struct task_group *tg = cfs_rq->tg; + + if (tg) + cgroup = tg->css.cgroup; + + if (cgroup) + cgroup_path(cgroup, path, sizeof(path)); + + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -175,13 +199,6 @@ static void print_cpu(struct seq_file *m, int cpu) PN(next_balance); P(curr->pid); PN(clock); - PN(idle_clock); - PN(prev_clock_raw); - P(clock_warps); - P(clock_overflows); - P(clock_underflows); - P(clock_deep_idle_events); - PN(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); P(cpu_load[2]); @@ -214,7 +231,6 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_batch_wakeup_granularity); PN(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN @@ -249,12 +265,9 @@ static int __init init_sched_debug_procfs(void) { struct proc_dir_entry *pe; - pe = create_proc_entry("sched_debug", 0644, NULL); + pe = proc_create("sched_debug", 0644, NULL, &sched_debug_fops); if (!pe) return -ENOMEM; - - pe->proc_fops = &sched_debug_fops; - return 0; } @@ -332,8 +345,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) avg_per_cpu = p->se.sum_exec_runtime; if (p->se.nr_migrations) { - avg_per_cpu = div64_64(avg_per_cpu, - p->se.nr_migrations); + avg_per_cpu = div64_u64(avg_per_cpu, + p->se.nr_migrations); } else { avg_per_cpu = -1LL; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 86a93376282..08ae848b71d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -62,24 +62,14 @@ const_debug unsigned int sysctl_sched_child_runs_first = 1; unsigned int __read_mostly sysctl_sched_compat_yield; /* - * SCHED_BATCH wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - */ -unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; - -/* * SCHED_OTHER wake-up granularity. - * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 5000000UL; +unsigned int sysctl_sched_wakeup_granularity = 10000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -87,6 +77,11 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; * CFS operations on generic schedulable entities: */ +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -98,6 +93,54 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on + * another cpu ('this_cpu') + */ +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return cfs_rq->tg->cfs_rq[this_cpu]; +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) entities belong to the same group ? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + if (se->cfs_rq == pse->cfs_rq) + return 1; + + return 0; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -107,13 +150,49 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) #define entity_is_task(se) 1 -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#define for_each_sched_entity(se) \ + for (; se; se = NULL) -static inline struct task_struct *task_of(struct sched_entity *se) +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) { - return container_of(se, struct task_struct, se); + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return &cpu_rq(this_cpu)->cfs; +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + return 1; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; } +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -283,29 +362,47 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_mine(__sched_period(cfs_rq->nr_running), - se->load.weight, &cfs_rq->load); + u64 slice = __sched_period(cfs_rq->nr_running); + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + slice *= se->load.weight; + do_div(slice, cfs_rq->load.weight); + } + + + return slice; } /* - * We calculate the vruntime slice. + * We calculate the vruntime slice of a to be inserted task * * vs = s/w = p/rw */ -static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 vslice = __sched_period(nr_running); + unsigned long nr_running = cfs_rq->nr_running; + unsigned long weight; + u64 vslice; - vslice *= NICE_0_LOAD; - do_div(vslice, rq_weight); + if (!se->on_rq) + nr_running++; - return vslice; -} + vslice = __sched_period(nr_running); -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return __sched_vslice(cfs_rq->load.weight + se->load.weight, - cfs_rq->nr_running + 1); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + weight = cfs_rq->load.weight; + if (!se->on_rq) + weight += se->load.weight; + + vslice *= NICE_0_LOAD; + do_div(vslice, weight); + } + + return vslice; } /* @@ -419,6 +516,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -427,6 +525,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -510,10 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) { - vruntime -= calc_delta_fair(sysctl_sched_latency, - &cfs_rq->load); - } + if (sched_feat(NEW_FAIR_SLEEPERS)) + vruntime -= sysctl_sched_latency; /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); @@ -529,6 +626,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + account_entity_enqueue(cfs_rq, se); if (wakeup) { place_entity(cfs_rq, se, 0); @@ -539,7 +637,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - account_entity_enqueue(cfs_rq, se); } static void update_avg(u64 *avg, u64 sample) @@ -629,20 +726,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static struct sched_entity * pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) { - s64 diff, gran; - if (!cfs_rq->next) return se; - diff = cfs_rq->next->vruntime - se->vruntime; - if (diff < 0) - return se; - - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); - if (diff > gran) + if (wakeup_preempt_entity(cfs_rq->next, se) != 0) return se; return cfs_rq->next; @@ -692,8 +785,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ - if (queued) - return resched_task(rq_of(cfs_rq)->curr); + if (queued) { + resched_task(rq_of(cfs_rq)->curr); + return; + } /* * don't let the period tick interfere with the hrtick preemption */ @@ -710,101 +805,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * CFS operations on tasks: */ -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* Walk up scheduling entities hierarchy */ -#define for_each_sched_entity(se) \ - for (; se; se = se->parent) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return p->se.cfs_rq; -} - -/* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - return se->cfs_rq; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return grp->my_q; -} - -/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on - * another cpu ('this_cpu') - */ -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return cfs_rq->tg->cfs_rq[this_cpu]; -} - -/* Iterate thr' all leaf cfs_rq's on a runqueue */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) - -/* Do the two (enqueued) entities belong to the same group ? */ -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - if (se->cfs_rq == pse->cfs_rq) - return 1; - - return 0; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return se->parent; -} - -#else /* CONFIG_FAIR_GROUP_SCHED */ - -#define for_each_sched_entity(se) \ - for (; se; se = NULL) - -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) -{ - return &task_rq(p)->cfs; -} - -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) -{ - struct task_struct *p = task_of(se); - struct rq *rq = task_rq(p); - - return &rq->cfs; -} - -/* runqueue "owned" by this group */ -static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) -{ - return NULL; -} - -static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) -{ - return &cpu_rq(this_cpu)->cfs; -} - -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) - -static inline int -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return 1; -} - -static inline struct sched_entity *parent_entity(struct sched_entity *se) -{ - return NULL; -} - -#endif /* CONFIG_FAIR_GROUP_SCHED */ - #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -903,7 +903,7 @@ static void yield_task_fair(struct rq *rq) return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - __update_rq_clock(rq); + update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ @@ -918,7 +918,7 @@ static void yield_task_fair(struct rq *rq) /* * Already in the rightmost position? */ - if (unlikely(rightmost->vruntime < se->vruntime)) + if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) return; /* @@ -953,11 +953,13 @@ static int wake_idle(int cpu, struct task_struct *p) * sibling runqueue info. This will avoid the checks and cache miss * penalities associated with that. */ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) return cpu; for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_IDLE) { + if ((sd->flags & SD_WAKE_IDLE) + || ((sd->flags & SD_WAKE_IDLE_FAR) + && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) { @@ -994,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *curr = this_rq->curr; unsigned long tl = this_load; unsigned long tl_per_task; + int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE)) + if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + balanced = 100*(tl + p->se.load.weight) <= imbalance*load; + + /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly * woken task: */ - if (sync && curr->sched_class == &fair_sched_class) { + if (sync && balanced && curr->sched_class == &fair_sched_class) { if (curr->se.avg_overlap < sysctl_sched_migration_cost && p->se.avg_overlap < sysctl_sched_migration_cost) return 1; @@ -1012,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { + balanced) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1101,6 +1106,59 @@ out: } #endif /* CONFIG_SMP */ +static unsigned long wakeup_gran(struct sched_entity *se) +{ + unsigned long gran = sysctl_sched_wakeup_granularity; + + /* + * More easily preempt - nice tasks, while not making + * it harder for + nice tasks. + */ + if (unlikely(se->load.weight > NICE_0_LOAD)) + gran = calc_delta_fair(gran, &se->load); + + return gran; +} + +/* + * Should 'se' preempt 'curr'. + * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff < 0) + return -1; + + gran = wakeup_gran(curr); + if (vdiff > gran) + return 1; + + return 0; +} + +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} /* * Preempt the current task with a newly woken task if needed: @@ -1110,7 +1168,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - unsigned long gran; + int se_depth, pse_depth; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1135,20 +1193,33 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; - while (!is_same_group(se, pse)) { + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. + */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(se); + pse_depth = depth_se(pse); + + while (se_depth > pse_depth) { + se_depth--; se = parent_entity(se); + } + + while (pse_depth > se_depth) { + pse_depth--; pse = parent_entity(pse); } - gran = sysctl_sched_wakeup_granularity; - /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. - */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + while (!is_same_group(se, pse)) { + se = parent_entity(se); + pse = parent_entity(pse); + } - if (pse->vruntime + gran < se->vruntime) + if (wakeup_preempt_entity(se, pse) == 1) resched_task(curr); } @@ -1199,15 +1270,27 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * the current task: */ static struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) +__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) { - struct task_struct *p; + struct task_struct *p = NULL; + struct sched_entity *se; - if (!curr) + if (next == &cfs_rq->tasks) return NULL; - p = rb_entry(curr, struct task_struct, se.run_node); - cfs_rq->rb_load_balance_curr = rb_next(curr); + /* Skip over entities that are not tasks */ + do { + se = list_entry(next, struct sched_entity, group_node); + next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); + + if (next == &cfs_rq->tasks) + return NULL; + + cfs_rq->balance_iterator = next; + + if (entity_is_task(se)) + p = task_of(se); return p; } @@ -1216,14 +1299,14 @@ static struct task_struct *load_balance_start_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); + return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); } static struct task_struct *load_balance_next_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); + return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1467,9 +1550,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; -#ifdef CONFIG_FAIR_GROUP_SCHED - print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); -#endif rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 00000000000..1c7283cb958 --- /dev/null +++ b/kernel/sched_features.h @@ -0,0 +1,10 @@ +SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +SCHED_FEAT(WAKEUP_PREEMPT, 1) +SCHED_FEAT(START_DEBIT, 1) +SCHED_FEAT(AFFINE_WAKEUPS, 1) +SCHED_FEAT(CACHE_HOT_BUDDY, 1) +SCHED_FEAT(SYNC_WAKEUPS, 1) +SCHED_FEAT(HRTICK, 1) +SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(NORMALIZED_SLEEPER, 1) +SCHED_FEAT(DEADLINE, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 2bcafa37563..3a4f92dbbe6 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -99,7 +99,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class = { +static const struct sched_class idle_sched_class = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e51642..0f3c19197fa 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) if (!rt_rq->tg) return RUNTIME_INF; - return rt_rq->tg->rt_runtime; + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -127,14 +132,39 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } +#ifdef CONFIG_SMP +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} +#endif + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &rt_rq->tg->rt_bandwidth; +} + #else static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - if (sysctl_sched_rt_runtime == -1) - return RUNTIME_INF; + return rt_rq->rt_runtime; +} - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(def_rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -173,6 +203,103 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) { return rt_rq->rt_throttled; } + +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &def_rt_bandwidth; +} + +#endif + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime; + + spin_lock(&rt_rq->rt_runtime_lock); + runtime = rt_rq->rt_runtime; + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + } else if (rt_rq->rt_nr_running) + idle = 0; + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + +#ifdef CONFIG_SMP +static int balance_runtime(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int i, weight, more = 0; + u64 rt_period; + + weight = cpus_weight(rd->span); + + spin_lock(&rt_b->rt_runtime_lock); + rt_period = ktime_to_ns(rt_b->rt_period); + for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + diff = iter->rt_runtime - iter->rt_time; + if (diff > 0) { + do_div(diff, weight); + if (rt_rq->rt_runtime + diff > rt_period) + diff = rt_period - rt_rq->rt_runtime; + iter->rt_runtime -= diff; + rt_rq->rt_runtime += diff; + more = 1; + if (rt_rq->rt_runtime == rt_period) { + spin_unlock(&iter->rt_runtime_lock); + break; + } + } + spin_unlock(&iter->rt_runtime_lock); + } + spin_unlock(&rt_b->rt_runtime_lock); + + return more; +} #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -197,12 +324,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); + if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) + return 0; + +#ifdef CONFIG_SMP if (rt_rq->rt_time > runtime) { - struct rq *rq = rq_of_rt_rq(rt_rq); + int more; - rq->rt_throttled = 1; - rt_rq->rt_throttled = 1; + spin_unlock(&rt_rq->rt_runtime_lock); + more = balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + + if (more) + runtime = sched_rt_runtime(rt_rq); + } +#endif + if (rt_rq->rt_time > runtime) { + rt_rq->rt_throttled = 1; if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -212,29 +351,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -static void update_sched_rt_period(struct rq *rq) -{ - struct rt_rq *rt_rq; - u64 period; - - while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rq->rt_period_expire += period; - - for_each_leaf_rt_rq(rt_rq, rq) { - u64 runtime = sched_rt_runtime(rt_rq); - - rt_rq->rt_time -= min(rt_rq->rt_time, runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - sched_rt_rq_enqueue(rt_rq); - } - } - - rq->rt_throttled = 0; - } -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -259,9 +375,15 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); + } } static inline @@ -284,6 +406,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +#else + start_rt_bandwidth(&def_rt_bandwidth); #endif } @@ -323,13 +450,19 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #endif } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq && rt_rq_throttled(group_rq)) + /* + * Don't enqueue the group if its throttled, or when empty. + * The latter is a consequence of the former when a child group + * get throttled and the current group doesn't have any other + * active members. + */ + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); @@ -338,7 +471,7 @@ static void enqueue_rt_entity(struct sched_rt_entity *rt_se) inc_rt_tasks(rt_se, rt_rq); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -353,27 +486,39 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) /* * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. - * - * XXX: O(1/2 h^2) because we can only walk up, not down the chain. - * doesn't matter much for now, as h=2 for GROUP_SCHED. */ -static void dequeue_rt_stack(struct task_struct *p) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se) { - struct sched_rt_entity *rt_se, *top_se; + struct sched_rt_entity *back = NULL; - /* - * dequeue all, top - down. - */ - do { - rt_se = &p->rt; - top_se = NULL; - for_each_sched_rt_entity(rt_se) { - if (on_rt_rq(rt_se)) - top_se = rt_se; - } - if (top_se) - dequeue_rt_entity(top_se); - } while (top_se); + for_each_sched_rt_entity(rt_se) { + rt_se->back = back; + back = rt_se; + } + + for (rt_se = back; rt_se; rt_se = rt_se->back) { + if (on_rt_rq(rt_se)) + __dequeue_rt_entity(rt_se); + } +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + for_each_sched_rt_entity(rt_se) + __enqueue_rt_entity(rt_se); +} + +static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + + for_each_sched_rt_entity(rt_se) { + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq && rt_rq->rt_nr_running) + __enqueue_rt_entity(rt_se); + } } /* @@ -386,32 +531,15 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) if (wakeup) rt_se->timeout = 0; - dequeue_rt_stack(p); - - /* - * enqueue everybody, bottom - up. - */ - for_each_sched_rt_entity(rt_se) - enqueue_rt_entity(rt_se); + enqueue_rt_entity(rt_se); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) { struct sched_rt_entity *rt_se = &p->rt; - struct rt_rq *rt_rq; update_curr_rt(rq); - - dequeue_rt_stack(p); - - /* - * re-enqueue all non-empty rt_rq entities. - */ - for_each_sched_rt_entity(rt_se) { - rt_rq = group_rt_rq(rt_se); - if (rt_rq && rt_rq->rt_nr_running) - enqueue_rt_entity(rt_se); - } + dequeue_rt_entity(rt_se); } /* @@ -422,8 +550,10 @@ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) { struct rt_prio_array *array = &rt_rq->active; + struct list_head *queue = array->queue + rt_se_prio(rt_se); - list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); + if (on_rt_rq(rt_se)) + list_move_tail(&rt_se->run_list, queue); } static void requeue_task_rt(struct rq *rq, struct task_struct *p) @@ -974,11 +1104,14 @@ static void post_schedule_rt(struct rq *rq) } } - +/* + * If we are not running and we are not going to reschedule soon, we should + * try to push tasks away now + */ static void task_wake_up_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && - (p->prio >= rq->rt.highest_prio) && + !test_tsk_need_resched(rq->curr) && rq->rt.overloaded) push_rt_tasks(rq); } @@ -1001,7 +1134,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) +static void set_cpus_allowed_rt(struct task_struct *p, + const cpumask_t *new_mask) { int weight = cpus_weight(*new_mask); @@ -1184,7 +1318,7 @@ static void set_curr_task_rt(struct rq *rq) p->se.exec_start = rq->clock; } -const struct sched_class rt_sched_class = { +static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 5b32433e7ee..80179ef7450 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,6 +9,11 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; + int mask_len = NR_CPUS/32 * 9; + char *mask_str = kmalloc(mask_len, GFP_KERNEL); + + if (mask_str == NULL) + return -ENOMEM; seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); seq_printf(seq, "timestamp %lu\n", jiffies); @@ -36,9 +41,8 @@ static int show_schedstat(struct seq_file *seq, void *v) preempt_disable(); for_each_domain(cpu, sd) { enum cpu_idle_type itype; - char mask_str[NR_CPUS]; - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); + cpumask_scnprintf(mask_str, mask_len, sd->span); seq_printf(seq, "domain%d %s", dcount++, mask_str); for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; itype++) { @@ -63,6 +67,7 @@ static int show_schedstat(struct seq_file *seq, void *v) preempt_enable(); #endif } + kfree(mask_str); return 0; } @@ -193,6 +198,9 @@ static inline void sched_info_queued(struct task_struct *t) /* * Called when a process ceases being the active-running process, either * voluntarily or involuntarily. Now we can calculate how long we ran. + * Also, if the process is still in the TASK_RUNNING state, call + * sched_info_queued() to mark that it has now again started waiting on + * the runqueue. */ static inline void sched_info_depart(struct task_struct *t) { @@ -201,6 +209,9 @@ static inline void sched_info_depart(struct task_struct *t) t->sched_info.cpu_time += delta; rq_sched_info_depart(task_rq(t), delta); + + if (t->state == TASK_RUNNING) + sched_info_queued(t); } /* diff --git a/kernel/semaphore.c b/kernel/semaphore.c new file mode 100644 index 00000000000..5c2942e768c --- /dev/null +++ b/kernel/semaphore.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox <willy@linux.intel.com> + * + * Distributed under the terms of the GNU GPL, version 2 + * + * This file implements counting semaphores. + * A counting semaphore may be acquired 'n' times before sleeping. + * See mutex.c for single-acquisition sleeping locks which enforce + * rules which allow code to be debugged more easily. + */ + +/* + * Some notes on the implementation: + * + * The spinlock controls access to the other members of the semaphore. + * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock. It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore. If it's zero, there may be tasks waiting on the wait_list. + */ + +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/spinlock.h> + +static noinline void __down(struct semaphore *sem); +static noinline int __down_interruptible(struct semaphore *sem); +static noinline int __down_killable(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline void __up(struct semaphore *sem); + +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore. If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. + * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ +void down(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + __down(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +/** + * down_interruptible - acquire the semaphore unless interrupted + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. + */ +int down_interruptible(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_interruptible(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_interruptible); + +/** + * down_killable - acquire the semaphore unless killed + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a fatal signal, this function will return + * -EINTR. If the semaphore is successfully acquired, this function returns + * 0. + */ +int down_killable(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_killable(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_killable); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically. Returns 0 if the mutex has + * been acquired successfully or 1 if it it cannot be acquired. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock! Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. + */ +int down_trylock(struct semaphore *sem) +{ + unsigned long flags; + int count; + + spin_lock_irqsave(&sem->lock, flags); + count = sem->count - 1; + if (likely(count >= 0)) + sem->count = count; + spin_unlock_irqrestore(&sem->lock, flags); + + return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @jiffies: how long to wait before failing + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. + */ +int down_timeout(struct semaphore *sem, long jiffies) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_timeout(sem, jiffies); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_timeout); + +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +void up(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(list_empty(&sem->wait_list))) + sem->count++; + else + __up(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + int up; +}; + +/* + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. + */ +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + struct task_struct *task = current; + struct semaphore_waiter waiter; + + list_add_tail(&waiter.list, &sem->wait_list); + waiter.task = task; + waiter.up = 0; + + for (;;) { + if (state == TASK_INTERRUPTIBLE && signal_pending(task)) + goto interrupted; + if (state == TASK_KILLABLE && fatal_signal_pending(task)) + goto interrupted; + if (timeout <= 0) + goto timed_out; + __set_task_state(task, state); + spin_unlock_irq(&sem->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&sem->lock); + if (waiter.up) + return 0; + } + + timed_out: + list_del(&waiter.list); + return -ETIME; + + interrupted: + list_del(&waiter.list); + return -EINTR; +} + +static noinline void __sched __down(struct semaphore *sem) +{ + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_interruptible(struct semaphore *sem) +{ + return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_killable(struct semaphore *sem) +{ + return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +{ + return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); +} + +static noinline void __sched __up(struct semaphore *sem) +{ + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = 1; + wake_up_process(waiter->task); +} diff --git a/kernel/signal.c b/kernel/signal.c index 6af1210092c..6c0958e52ea 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -39,11 +39,19 @@ static struct kmem_cache *sigqueue_cachep; +static int __sig_ignored(struct task_struct *t, int sig) +{ + void __user *handler; + + /* Is it explicitly or implicitly ignored? */ + + handler = t->sighand->action[sig - 1].sa.sa_handler; + return handler == SIG_IGN || + (handler == SIG_DFL && sig_kernel_ignore(sig)); +} static int sig_ignored(struct task_struct *t, int sig) { - void __user * handler; - /* * Tracers always want to know about signals.. */ @@ -58,10 +66,7 @@ static int sig_ignored(struct task_struct *t, int sig) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - /* Is it explicitly or implicitly ignored? */ - handler = t->sighand->action[sig-1].sa.sa_handler; - return handler == SIG_IGN || - (handler == SIG_DFL && sig_kernel_ignore(sig)); + return __sig_ignored(t, sig); } /* @@ -220,12 +225,46 @@ void flush_signals(struct task_struct *t) unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t,TIF_SIGPENDING); + clear_tsk_thread_flag(t, TIF_SIGPENDING); flush_sigqueue(&t->pending); flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } +static void __flush_itimer_signals(struct sigpending *pending) +{ + sigset_t signal, retain; + struct sigqueue *q, *n; + + signal = pending->signal; + sigemptyset(&retain); + + list_for_each_entry_safe(q, n, &pending->list, list) { + int sig = q->info.si_signo; + + if (likely(q->info.si_code != SI_TIMER)) { + sigaddset(&retain, sig); + } else { + sigdelset(&signal, sig); + list_del_init(&q->list); + __sigqueue_free(q); + } + } + + sigorsets(&pending->signal, &signal, &retain); +} + +void flush_itimer_signals(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + __flush_itimer_signals(&tsk->pending); + __flush_itimer_signals(&tsk->signal->shared_pending); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); +} + void ignore_signals(struct task_struct *t) { int i; @@ -372,7 +411,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, */ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { - int signr = 0; + int signr; /* We only dequeue private signals from ourselves, we don't let * signalfd steal them @@ -405,8 +444,12 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) } } } + recalc_sigpending(); - if (signr && unlikely(sig_kernel_stop(signr))) { + if (!signr) + return 0; + + if (unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our * caller might release the siglock and then the pending @@ -422,9 +465,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; } - if (signr && - ((info->si_code & __SI_MASK) == __SI_TIMER) && - info->si_sys_private){ + if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. Note, we leave @@ -526,21 +567,34 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s) static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - int error = -EINVAL; + struct pid *sid; + int error; + if (!valid_signal(sig)) - return error; + return -EINVAL; - if (info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) { - error = audit_signal_info(sig, t); /* Let audit system see the signal */ - if (error) - return error; - error = -EPERM; - if (((sig != SIGCONT) || - (task_session_nr(current) != task_session_nr(t))) - && (current->euid ^ t->suid) && (current->euid ^ t->uid) - && (current->uid ^ t->suid) && (current->uid ^ t->uid) - && !capable(CAP_KILL)) + if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) + return 0; + + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) return error; + + if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && + (current->uid ^ t->suid) && (current->uid ^ t->uid) && + !capable(CAP_KILL)) { + switch (sig) { + case SIGCONT: + sid = task_session(t); + /* + * We don't return the error if sid == NULL. The + * task was unhashed, the caller must notice this. + */ + if (!sid || sid == task_session(current)) + break; + default: + return -EPERM; + } } return security_task_kill(t, info, sig, 0); @@ -550,62 +604,44 @@ static int check_kill_permission(int sig, struct siginfo *info, static void do_notify_parent_cldstop(struct task_struct *tsk, int why); /* - * Handle magic process-wide effects of stop/continue signals. - * Unlike the signal actions, these happen immediately at signal-generation + * Handle magic process-wide effects of stop/continue signals. Unlike + * the signal actions, these happen immediately at signal-generation * time regardless of blocking, ignoring, or handling. This does the * actual continuing for SIGCONT, but not the actual stopping for stop - * signals. The process stop is done as a signal action for SIG_DFL. + * signals. The process stop is done as a signal action for SIG_DFL. + * + * Returns true if the signal should be actually delivered, otherwise + * it should be dropped. */ -static void handle_stop_signal(int sig, struct task_struct *p) +static int prepare_signal(int sig, struct task_struct *p) { + struct signal_struct *signal = p->signal; struct task_struct *t; - if (p->signal->flags & SIGNAL_GROUP_EXIT) + if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { /* - * The process is in the middle of dying already. + * The process is in the middle of dying, nothing to do. */ - return; - - if (sig_kernel_stop(sig)) { + } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. */ - rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); + rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); t = p; do { rm_from_queue(sigmask(SIGCONT), &t->pending); - t = next_thread(t); - } while (t != p); + } while_each_thread(p, t); } else if (sig == SIGCONT) { + unsigned int why; /* * Remove all stop signals from all queues, * and wake all threads. */ - if (unlikely(p->signal->group_stop_count > 0)) { - /* - * There was a group stop in progress. We'll - * pretend it finished before we got here. We are - * obliged to report it to the parent: if the - * SIGSTOP happened "after" this SIGCONT, then it - * would have cleared this pending SIGCONT. If it - * happened "before" this SIGCONT, then the parent - * got the SIGCHLD about the stop finishing before - * the continue happened. We do the notification - * now, and it's as if the stop had finished and - * the SIGCHLD was pending on entry to this kill. - */ - p->signal->group_stop_count = 0; - p->signal->flags = SIGNAL_STOP_CONTINUED; - spin_unlock(&p->sighand->siglock); - do_notify_parent_cldstop(p, CLD_STOPPED); - spin_lock(&p->sighand->siglock); - } - rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); + rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { unsigned int state; rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - /* * If there is a handler for SIGCONT, we must make * sure that no thread returns to user mode before @@ -615,7 +651,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) * running the handler. With the TIF_SIGPENDING * flag set, the thread will pause and acquire the * siglock that we hold now and until we've queued - * the pending signal. + * the pending signal. * * Wake up the stopped thread _after_ setting * TIF_SIGPENDING @@ -626,49 +662,163 @@ static void handle_stop_signal(int sig, struct task_struct *p) state |= TASK_INTERRUPTIBLE; } wake_up_state(t, state); + } while_each_thread(p, t); - t = next_thread(t); - } while (t != p); + /* + * Notify the parent with CLD_CONTINUED if we were stopped. + * + * If we were in the middle of a group stop, we pretend it + * was already finished, and then continued. Since SIGCHLD + * doesn't queue we report only CLD_STOPPED, as if the next + * CLD_CONTINUED was dropped. + */ + why = 0; + if (signal->flags & SIGNAL_STOP_STOPPED) + why |= SIGNAL_CLD_CONTINUED; + else if (signal->group_stop_count) + why |= SIGNAL_CLD_STOPPED; - if (p->signal->flags & SIGNAL_STOP_STOPPED) { + if (why) { /* - * We were in fact stopped, and are now continued. - * Notify the parent with CLD_CONTINUED. + * The first thread which returns from finish_stop() + * will take ->siglock, notice SIGNAL_CLD_MASK, and + * notify its parent. See get_signal_to_deliver(). */ - p->signal->flags = SIGNAL_STOP_CONTINUED; - p->signal->group_exit_code = 0; - spin_unlock(&p->sighand->siglock); - do_notify_parent_cldstop(p, CLD_CONTINUED); - spin_lock(&p->sighand->siglock); + signal->flags = why | SIGNAL_STOP_CONTINUED; + signal->group_stop_count = 0; + signal->group_exit_code = 0; } else { /* * We are not stopped, but there could be a stop * signal in the middle of being processed after * being removed from the queue. Clear that too. */ - p->signal->flags = 0; + signal->flags &= ~SIGNAL_STOP_DEQUEUED; + } + } + + return !sig_ignored(p, sig); +} + +/* + * Test if P wants to take SIG. After we've checked all threads with this, + * it's equivalent to finding no threads not blocking SIG. Any threads not + * blocking SIG were ruled out because they are not running and already + * have pending signals. Such threads will dequeue from the shared queue + * as soon as they're available, so putting the signal on the shared queue + * will be equivalent to sending it to one such thread. + */ +static inline int wants_signal(int sig, struct task_struct *p) +{ + if (sigismember(&p->blocked, sig)) + return 0; + if (p->flags & PF_EXITING) + return 0; + if (sig == SIGKILL) + return 1; + if (task_is_stopped_or_traced(p)) + return 0; + return task_curr(p) || !signal_pending(p); +} + +static void complete_signal(int sig, struct task_struct *p, int group) +{ + struct signal_struct *signal = p->signal; + struct task_struct *t; + + /* + * Now find a thread we can wake up to take the signal off the queue. + * + * If the main thread wants the signal, it gets first crack. + * Probably the least surprising to the average bear. + */ + if (wants_signal(sig, p)) + t = p; + else if (!group || thread_group_empty(p)) + /* + * There is just one thread and it does not need to be woken. + * It will dequeue unblocked signals before it runs again. + */ + return; + else { + /* + * Otherwise try to find a suitable thread. + */ + t = signal->curr_target; + while (!wants_signal(sig, t)) { + t = next_thread(t); + if (t == signal->curr_target) + /* + * No thread needs to be woken. + * Any eligible threads will see + * the signal in the queue soon. + */ + return; } - } else if (sig == SIGKILL) { + signal->curr_target = t; + } + + /* + * Found a killable thread. If the signal will be fatal, + * then start taking the whole group down immediately. + */ + if (sig_fatal(p, sig) && + !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && + !sigismember(&t->real_blocked, sig) && + (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { /* - * Make sure that any pending stop signal already dequeued - * is undone by the wakeup for SIGKILL. + * This signal will be fatal to the whole group. */ - p->signal->flags = 0; + if (!sig_kernel_coredump(sig)) { + /* + * Start a group exit and wake everybody up. + * This way we don't have other threads + * running and doing things after a slower + * thread has the fatal signal pending. + */ + signal->flags = SIGNAL_GROUP_EXIT; + signal->group_exit_code = sig; + signal->group_stop_count = 0; + t = p; + do { + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); + } while_each_thread(p, t); + return; + } } + + /* + * The signal is already in the shared-pending queue. + * Tell the chosen thread to wake up and dequeue it. + */ + signal_wake_up(t, sig == SIGKILL); + return; +} + +static inline int legacy_queue(struct sigpending *signals, int sig) +{ + return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - struct sigpending *signals) + int group) { - struct sigqueue * q = NULL; - int ret = 0; + struct sigpending *pending; + struct sigqueue *q; + + assert_spin_locked(&t->sighand->siglock); + if (!prepare_signal(sig, t)) + return 0; + pending = group ? &t->signal->shared_pending : &t->pending; /* - * Deliver the signal to listening signalfds. This must be called - * with the sighand lock held. + * Short-circuit ignored signals and support queuing + * exactly one non-rt signal, so that we can get more + * detailed information about the cause of the signal. */ - signalfd_notify(t, sig); - + if (legacy_queue(pending, sig)) + return 0; /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. @@ -688,7 +838,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, (is_si_special(info) || info->si_code >= 0))); if (q) { - list_add_tail(&q->list, &signals->list); + list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: q->info.si_signo = sig; @@ -718,13 +868,12 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, } out_set: - sigaddset(&signals->signal, sig); - return ret; + signalfd_notify(t, sig); + sigaddset(&pending->signal, sig); + complete_signal(sig, t, group); + return 0; } -#define LEGACY_QUEUE(sigptr, sig) \ - (((sig) < SIGRTMIN) && sigismember(&(sigptr)->signal, (sig))) - int print_fatal_signals; static void print_fatal_signal(struct pt_regs *regs, int signr) @@ -757,29 +906,16 @@ static int __init setup_print_fatal_signals(char *str) __setup("print-fatal-signals=", setup_print_fatal_signals); +int +__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +{ + return send_signal(sig, info, p, 1); +} + static int specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) { - int ret = 0; - - BUG_ON(!irqs_disabled()); - assert_spin_locked(&t->sighand->siglock); - - /* Short-circuit ignored signals. */ - if (sig_ignored(t, sig)) - goto out; - - /* Support queueing exactly one non-rt signal, so that we - can get more detailed information about the cause of - the signal. */ - if (LEGACY_QUEUE(&t->pending, sig)) - goto out; - - ret = send_signal(sig, info, t, &t->pending); - if (!ret && !sigismember(&t->blocked, sig)) - signal_wake_up(t, sig == SIGKILL); -out: - return ret; + return send_signal(sig, info, t, 0); } /* @@ -790,7 +926,8 @@ out: * since we do not want to have a signal handler that was blocked * be invoked when user space had explicitly blocked it. * - * We don't want to have recursive SIGSEGV's etc, for example. + * We don't want to have recursive SIGSEGV's etc, for example, + * that is why we also clear SIGNAL_UNKILLABLE. */ int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) @@ -810,6 +947,8 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } + if (action->sa.sa_handler == SIG_DFL) + t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); @@ -823,134 +962,6 @@ force_sig_specific(int sig, struct task_struct *t) } /* - * Test if P wants to take SIG. After we've checked all threads with this, - * it's equivalent to finding no threads not blocking SIG. Any threads not - * blocking SIG were ruled out because they are not running and already - * have pending signals. Such threads will dequeue from the shared queue - * as soon as they're available, so putting the signal on the shared queue - * will be equivalent to sending it to one such thread. - */ -static inline int wants_signal(int sig, struct task_struct *p) -{ - if (sigismember(&p->blocked, sig)) - return 0; - if (p->flags & PF_EXITING) - return 0; - if (sig == SIGKILL) - return 1; - if (task_is_stopped_or_traced(p)) - return 0; - return task_curr(p) || !signal_pending(p); -} - -static void -__group_complete_signal(int sig, struct task_struct *p) -{ - struct task_struct *t; - - /* - * Now find a thread we can wake up to take the signal off the queue. - * - * If the main thread wants the signal, it gets first crack. - * Probably the least surprising to the average bear. - */ - if (wants_signal(sig, p)) - t = p; - else if (thread_group_empty(p)) - /* - * There is just one thread and it does not need to be woken. - * It will dequeue unblocked signals before it runs again. - */ - return; - else { - /* - * Otherwise try to find a suitable thread. - */ - t = p->signal->curr_target; - if (t == NULL) - /* restart balancing at this thread */ - t = p->signal->curr_target = p; - - while (!wants_signal(sig, t)) { - t = next_thread(t); - if (t == p->signal->curr_target) - /* - * No thread needs to be woken. - * Any eligible threads will see - * the signal in the queue soon. - */ - return; - } - p->signal->curr_target = t; - } - - /* - * Found a killable thread. If the signal will be fatal, - * then start taking the whole group down immediately. - */ - if (sig_fatal(p, sig) && !(p->signal->flags & SIGNAL_GROUP_EXIT) && - !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { - /* - * This signal will be fatal to the whole group. - */ - if (!sig_kernel_coredump(sig)) { - /* - * Start a group exit and wake everybody up. - * This way we don't have other threads - * running and doing things after a slower - * thread has the fatal signal pending. - */ - p->signal->flags = SIGNAL_GROUP_EXIT; - p->signal->group_exit_code = sig; - p->signal->group_stop_count = 0; - t = p; - do { - sigaddset(&t->pending.signal, SIGKILL); - signal_wake_up(t, 1); - } while_each_thread(p, t); - return; - } - } - - /* - * The signal is already in the shared-pending queue. - * Tell the chosen thread to wake up and dequeue it. - */ - signal_wake_up(t, sig == SIGKILL); - return; -} - -int -__group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) -{ - int ret = 0; - - assert_spin_locked(&p->sighand->siglock); - handle_stop_signal(sig, p); - - /* Short-circuit ignored signals. */ - if (sig_ignored(p, sig)) - return ret; - - if (LEGACY_QUEUE(&p->signal->shared_pending, sig)) - /* This is a non-RT signal and we already have one queued. */ - return ret; - - /* - * Put this signal on the shared-pending queue, or fail with EAGAIN. - * We always use the shared queue for process-wide signals, - * to avoid several races. - */ - ret = send_signal(sig, info, p, &p->signal->shared_pending); - if (unlikely(ret)) - return ret; - - __group_complete_signal(sig, p); - return 0; -} - -/* * Nuke all other threads in the group. */ void zap_other_threads(struct task_struct *p) @@ -978,13 +989,11 @@ int __fatal_signal_pending(struct task_struct *tsk) } EXPORT_SYMBOL(__fatal_signal_pending); -/* - * Must be called under rcu_read_lock() or with tasklist_lock read-held. - */ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; + rcu_read_lock(); for (;;) { sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) @@ -995,6 +1004,7 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long break; spin_unlock_irqrestore(&sighand->siglock, *flags); } + rcu_read_unlock(); return sighand; } @@ -1043,9 +1053,6 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) struct task_struct *p; rcu_read_lock(); - if (unlikely(sig_needs_tasklist(sig))) - read_lock(&tasklist_lock); - retry: p = pid_task(pid, PIDTYPE_PID); if (p) { @@ -1059,10 +1066,8 @@ retry: */ goto retry; } - - if (unlikely(sig_needs_tasklist(sig))) - read_unlock(&tasklist_lock); rcu_read_unlock(); + return error; } @@ -1159,8 +1164,7 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) */ /* - * These two are the most common entry points. They send a signal - * just to the specific thread. + * The caller must ensure the task can't exit. */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) @@ -1175,17 +1179,9 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - /* - * We need the tasklist lock even for the specific - * thread case (when we don't need to follow the group - * lists) in order to avoid races with "p->sighand" - * going away or changing from under us. - */ - read_lock(&tasklist_lock); spin_lock_irqsave(&p->sighand->siglock, flags); ret = specific_send_sig_info(sig, info, p); spin_unlock_irqrestore(&p->sighand->siglock, flags); - read_unlock(&tasklist_lock); return ret; } @@ -1278,121 +1274,60 @@ void sigqueue_free(struct sigqueue *q) BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); /* - * If the signal is still pending remove it from the - * pending queue. We must hold ->siglock while testing - * q->list to serialize with collect_signal(). + * We must hold ->siglock while testing q->list + * to serialize with collect_signal() or with + * __exit_signal()->flush_sigqueue(). */ spin_lock_irqsave(lock, flags); - if (!list_empty(&q->list)) - list_del_init(&q->list); - spin_unlock_irqrestore(lock, flags); - q->flags &= ~SIGQUEUE_PREALLOC; - __sigqueue_free(q); -} - -int send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) -{ - unsigned long flags; - int ret = 0; - - BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - /* - * The rcu based delayed sighand destroy makes it possible to - * run this without tasklist lock held. The task struct itself - * cannot go away as create_timer did get_task_struct(). - * - * We return -1, when the task is marked exiting, so - * posix_timer_event can redirect it to the group leader + * If it is queued it will be freed when dequeued, + * like the "regular" sigqueue. */ - rcu_read_lock(); - - if (!likely(lock_task_sighand(p, &flags))) { - ret = -1; - goto out_err; - } - - if (unlikely(!list_empty(&q->list))) { - /* - * If an SI_TIMER entry is already queue just increment - * the overrun count. - */ - BUG_ON(q->info.si_code != SI_TIMER); - q->info.si_overrun++; - goto out; - } - /* Short-circuit ignored signals. */ - if (sig_ignored(p, sig)) { - ret = 1; - goto out; - } - /* - * Deliver the signal to listening signalfds. This must be called - * with the sighand lock held. - */ - signalfd_notify(p, sig); - - list_add_tail(&q->list, &p->pending.list); - sigaddset(&p->pending.signal, sig); - if (!sigismember(&p->blocked, sig)) - signal_wake_up(p, sig == SIGKILL); - -out: - unlock_task_sighand(p, &flags); -out_err: - rcu_read_unlock(); + if (!list_empty(&q->list)) + q = NULL; + spin_unlock_irqrestore(lock, flags); - return ret; + if (q) + __sigqueue_free(q); } -int -send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) +int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) { + int sig = q->info.si_signo; + struct sigpending *pending; unsigned long flags; - int ret = 0; + int ret; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); - /* Since it_lock is held, p->sighand cannot be NULL. */ - spin_lock_irqsave(&p->sighand->siglock, flags); - handle_stop_signal(sig, p); + ret = -1; + if (!likely(lock_task_sighand(t, &flags))) + goto ret; - /* Short-circuit ignored signals. */ - if (sig_ignored(p, sig)) { - ret = 1; + ret = 1; /* the signal is ignored */ + if (!prepare_signal(sig, t)) goto out; - } + ret = 0; if (unlikely(!list_empty(&q->list))) { /* * If an SI_TIMER entry is already queue just increment - * the overrun count. Other uses should not try to - * send the signal multiple times. + * the overrun count. */ BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; goto out; - } - /* - * Deliver the signal to listening signalfds. This must be called - * with the sighand lock held. - */ - signalfd_notify(p, sig); - - /* - * Put this signal on the shared-pending queue. - * We always use the shared queue for process-wide signals, - * to avoid several races. - */ - list_add_tail(&q->list, &p->signal->shared_pending.list); - sigaddset(&p->signal->shared_pending.signal, sig); + } - __group_complete_signal(sig, p); + signalfd_notify(t, sig); + pending = group ? &t->signal->shared_pending : &t->pending; + list_add_tail(&q->list, &pending->list); + sigaddset(&pending->signal, sig); + complete_signal(sig, t, group); out: - spin_unlock_irqrestore(&p->sighand->siglock, flags); - read_unlock(&tasklist_lock); + unlock_task_sighand(t, &flags); +ret: return ret; } @@ -1723,8 +1658,9 @@ static int do_signal_stop(int signr) } else { struct task_struct *t; - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || - unlikely(sig->group_exit_task)) + if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) + != SIGNAL_STOP_DEQUEUED) || + unlikely(signal_group_exit(sig))) return 0; /* * There is no group stop already in progress. @@ -1757,11 +1693,51 @@ static int do_signal_stop(int signr) return 1; } +static int ptrace_signal(int signr, siginfo_t *info, + struct pt_regs *regs, void *cookie) +{ + if (!(current->ptrace & PT_PTRACED)) + return signr; + + ptrace_signal_deliver(regs, cookie); + + /* Let the debugger run. */ + ptrace_stop(signr, 0, info); + + /* We're back. Did the debugger cancel the sig? */ + signr = current->exit_code; + if (signr == 0) + return signr; + + current->exit_code = 0; + + /* Update the siginfo structure if the signal has + changed. If the debugger wanted something + specific in the siginfo structure then it should + have updated *info via PTRACE_SETSIGINFO. */ + if (signr != info->si_signo) { + info->si_signo = signr; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_pid_vnr(current->parent); + info->si_uid = current->parent->uid; + } + + /* If the (new) signal is now blocked, requeue it. */ + if (sigismember(¤t->blocked, signr)) { + specific_send_sig_info(signr, info, current); + signr = 0; + } + + return signr; +} + int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie) { - sigset_t *mask = ¤t->blocked; - int signr = 0; + struct sighand_struct *sighand = current->sighand; + struct signal_struct *signal = current->signal; + int signr; relock: /* @@ -1772,52 +1748,42 @@ relock: */ try_to_freeze(); - spin_lock_irq(¤t->sighand->siglock); + spin_lock_irq(&sighand->siglock); + /* + * Every stopped thread goes here after wakeup. Check to see if + * we should notify the parent, prepare_signal(SIGCONT) encodes + * the CLD_ si_code into SIGNAL_CLD_MASK bits. + */ + if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { + int why = (signal->flags & SIGNAL_STOP_CONTINUED) + ? CLD_CONTINUED : CLD_STOPPED; + signal->flags &= ~SIGNAL_CLD_MASK; + spin_unlock_irq(&sighand->siglock); + + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); + goto relock; + } + for (;;) { struct k_sigaction *ka; - if (unlikely(current->signal->group_stop_count > 0) && + if (unlikely(signal->group_stop_count > 0) && do_signal_stop(0)) goto relock; - signr = dequeue_signal(current, mask, info); - + signr = dequeue_signal(current, ¤t->blocked, info); if (!signr) break; /* will return 0 */ - if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { - ptrace_signal_deliver(regs, cookie); - - /* Let the debugger run. */ - ptrace_stop(signr, 0, info); - - /* We're back. Did the debugger cancel the sig? */ - signr = current->exit_code; - if (signr == 0) - continue; - - current->exit_code = 0; - - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ - if (signr != info->si_signo) { - info->si_signo = signr; - info->si_errno = 0; - info->si_code = SI_USER; - info->si_pid = task_pid_vnr(current->parent); - info->si_uid = current->parent->uid; - } - - /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(¤t->blocked, signr)) { - specific_send_sig_info(signr, info, current); + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, regs, cookie); + if (!signr) continue; - } } - ka = ¤t->sighand->action[signr-1]; + ka = &sighand->action[signr-1]; if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { @@ -1839,7 +1805,8 @@ relock: /* * Global init gets no signals it doesn't want. */ - if (is_global_init(current)) + if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && + !signal_group_exit(signal)) continue; if (sig_kernel_stop(signr)) { @@ -1854,14 +1821,14 @@ relock: * We need to check for that and bail out if necessary. */ if (signr != SIGSTOP) { - spin_unlock_irq(¤t->sighand->siglock); + spin_unlock_irq(&sighand->siglock); /* signals can be posted during this window */ if (is_current_pgrp_orphaned()) goto relock; - spin_lock_irq(¤t->sighand->siglock); + spin_lock_irq(&sighand->siglock); } if (likely(do_signal_stop(signr))) { @@ -1876,15 +1843,16 @@ relock: continue; } - spin_unlock_irq(¤t->sighand->siglock); + spin_unlock_irq(&sighand->siglock); /* * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; - if ((signr != SIGKILL) && print_fatal_signals) - print_fatal_signal(regs, signr); + if (sig_kernel_coredump(signr)) { + if (print_fatal_signals) + print_fatal_signal(regs, signr); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with @@ -1902,7 +1870,7 @@ relock: do_group_exit(signr); /* NOTREACHED */ } - spin_unlock_irq(¤t->sighand->siglock); + spin_unlock_irq(&sighand->siglock); return signr; } @@ -2246,6 +2214,7 @@ static int do_tkill(int tgid, int pid, int sig) int error; struct siginfo info; struct task_struct *p; + unsigned long flags; error = -ESRCH; info.si_signo = sig; @@ -2254,22 +2223,24 @@ static int do_tkill(int tgid, int pid, int sig) info.si_pid = task_tgid_vnr(current); info.si_uid = current->uid; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { error = check_kill_permission(sig, &info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. + * + * If lock_task_sighand() fails we pretend the task dies + * after receiving the signal. The window is tiny, and the + * signal is private anyway. */ - if (!error && sig && p->sighand) { - spin_lock_irq(&p->sighand->siglock); - handle_stop_signal(sig, p); + if (!error && sig && lock_task_sighand(p, &flags)) { error = specific_send_sig_info(sig, &info, p); - spin_unlock_irq(&p->sighand->siglock); + unlock_task_sighand(p, &flags); } } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return error; } @@ -2326,13 +2297,14 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { + struct task_struct *t = current; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; - k = ¤t->sighand->action[sig-1]; + k = &t->sighand->action[sig-1]; spin_lock_irq(¤t->sighand->siglock); if (oact) @@ -2353,9 +2325,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (act->sa.sa_handler == SIG_IGN || - (act->sa.sa_handler == SIG_DFL && sig_kernel_ignore(sig))) { - struct task_struct *t = current; + if (__sig_ignored(t, sig)) { sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); @@ -2610,7 +2580,7 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) current->state = TASK_INTERRUPTIBLE; schedule(); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); return -ERESTARTNOHAND; } #endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 31e9f2a4792..36e06174004 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -356,7 +356,8 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) /* Tasklets */ struct tasklet_head { - struct tasklet_struct *list; + struct tasklet_struct *head; + struct tasklet_struct **tail; }; /* Some compilers disobey section attribute on statics when not @@ -369,8 +370,9 @@ void __tasklet_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_vec).tail = t; + __get_cpu_var(tasklet_vec).tail = &(t->next); raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -382,8 +384,9 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = t; + __get_cpu_var(tasklet_hi_vec).tail = &(t->next); raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } @@ -395,8 +398,9 @@ static void tasklet_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = NULL; + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; local_irq_enable(); while (list) { @@ -416,8 +420,9 @@ static void tasklet_action(struct softirq_action *a) } local_irq_disable(); - t->next = __get_cpu_var(tasklet_vec).list; - __get_cpu_var(tasklet_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_vec).tail = t; + __get_cpu_var(tasklet_vec).tail = &(t->next); __raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_enable(); } @@ -428,8 +433,9 @@ static void tasklet_hi_action(struct softirq_action *a) struct tasklet_struct *list; local_irq_disable(); - list = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = NULL; + list = __get_cpu_var(tasklet_hi_vec).head; + __get_cpu_var(tasklet_hi_vec).head = NULL; + __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; local_irq_enable(); while (list) { @@ -449,8 +455,9 @@ static void tasklet_hi_action(struct softirq_action *a) } local_irq_disable(); - t->next = __get_cpu_var(tasklet_hi_vec).list; - __get_cpu_var(tasklet_hi_vec).list = t; + t->next = NULL; + *__get_cpu_var(tasklet_hi_vec).tail = t; + __get_cpu_var(tasklet_hi_vec).tail = &(t->next); __raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); } @@ -487,6 +494,15 @@ EXPORT_SYMBOL(tasklet_kill); void __init softirq_init(void) { + int cpu; + + for_each_possible_cpu(cpu) { + per_cpu(tasklet_vec, cpu).tail = + &per_cpu(tasklet_vec, cpu).head; + per_cpu(tasklet_hi_vec, cpu).tail = + &per_cpu(tasklet_hi_vec, cpu).head; + } + open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } @@ -555,9 +571,12 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) return; /* CPU is dead, so no lock needed. */ - for (i = &per_cpu(tasklet_vec, cpu).list; *i; i = &(*i)->next) { + for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { if (*i == t) { *i = t->next; + /* If this was the tail element, move the tail ptr */ + if (*i == NULL) + per_cpu(tasklet_vec, cpu).tail = i; return; } } @@ -566,20 +585,24 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) static void takeover_tasklets(unsigned int cpu) { - struct tasklet_struct **i; - /* CPU is dead, so no lock needed. */ local_irq_disable(); /* Find end, append list for that CPU. */ - for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next); - *i = per_cpu(tasklet_vec, cpu).list; - per_cpu(tasklet_vec, cpu).list = NULL; + if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { + *(__get_cpu_var(tasklet_vec).tail) = per_cpu(tasklet_vec, cpu).head; + __get_cpu_var(tasklet_vec).tail = per_cpu(tasklet_vec, cpu).tail; + per_cpu(tasklet_vec, cpu).head = NULL; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; + } raise_softirq_irqoff(TASKLET_SOFTIRQ); - for (i = &__get_cpu_var(tasklet_hi_vec).list; *i; i = &(*i)->next); - *i = per_cpu(tasklet_hi_vec, cpu).list; - per_cpu(tasklet_hi_vec, cpu).list = NULL; + if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { + *__get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).head; + __get_cpu_var(tasklet_hi_vec).tail = per_cpu(tasklet_hi_vec, cpu).tail; + per_cpu(tasklet_hi_vec, cpu).head = NULL; + per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + } raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 01b6522fd92..c828c2339cc 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -49,12 +49,17 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -void touch_softlockup_watchdog(void) +static void __touch_softlockup_watchdog(void) { int this_cpu = raw_smp_processor_id(); __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); } + +void touch_softlockup_watchdog(void) +{ + __raw_get_cpu_var(touch_timestamp) = 0; +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -80,7 +85,7 @@ void softlockup_tick(void) unsigned long now; if (touch_timestamp == 0) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -95,7 +100,7 @@ void softlockup_tick(void) /* do not print during early bootup: */ if (unlikely(system_state != SYSTEM_RUNNING)) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); return; } @@ -214,7 +219,7 @@ static int watchdog(void *__bind_cpu) sched_setscheduler(current, SCHED_FIFO, ¶m); /* initialize timestamp */ - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); set_current_state(TASK_INTERRUPTIBLE); /* @@ -223,7 +228,7 @@ static int watchdog(void *__bind_cpu) * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - touch_softlockup_watchdog(); + __touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 6f4e0e13f70..b7350bbfb07 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -11,7 +11,6 @@ #include <linux/interrupt.h> #include <asm/atomic.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> /* Since we effect priority and affinity (both of which are visible @@ -35,7 +34,7 @@ static int stopmachine(void *cpu) int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); + set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu)); /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ @@ -63,8 +62,7 @@ static int stopmachine(void *cpu) * help our sisters onto their CPUs. */ if (!prepared && !irqs_disabled) yield(); - else - cpu_relax(); + cpu_relax(); } /* Ack: we are exiting. */ @@ -107,8 +105,10 @@ static int stop_machine(void) } /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) + while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { yield(); + cpu_relax(); + } /* If some failed, kill them all. */ if (ret < 0) { @@ -135,8 +135,7 @@ static void restart_machine(void) preempt_enable_no_resched(); } -struct stop_machine_data -{ +struct stop_machine_data { int (*fn)(void *); void *data; struct completion done; diff --git a/kernel/sys.c b/kernel/sys.c index a626116af5d..14e97282eb6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -67,6 +67,12 @@ #ifndef SET_ENDIAN # define SET_ENDIAN(a,b) (-EINVAL) #endif +#ifndef GET_TSC_CTL +# define GET_TSC_CTL(a) (-EINVAL) +#endif +#ifndef SET_TSC_CTL +# define SET_TSC_CTL(a) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -972,8 +978,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) goto out; if (task_pgrp(p) != pgrp) { - detach_pid(p, PIDTYPE_PGID); - attach_pid(p, PIDTYPE_PGID, pgrp); + change_pid(p, PIDTYPE_PGID, pgrp); set_task_pgrp(p, pid_nr(pgrp)); } @@ -986,54 +991,67 @@ out: asmlinkage long sys_getpgid(pid_t pid) { + struct task_struct *p; + struct pid *grp; + int retval; + + rcu_read_lock(); if (!pid) - return task_pgrp_vnr(current); + grp = task_pgrp(current); else { - int retval; - struct task_struct *p; - - read_lock(&tasklist_lock); - p = find_task_by_vpid(pid); retval = -ESRCH; - if (p) { - retval = security_task_getpgid(p); - if (!retval) - retval = task_pgrp_vnr(p); - } - read_unlock(&tasklist_lock); - return retval; + p = find_task_by_vpid(pid); + if (!p) + goto out; + grp = task_pgrp(p); + if (!grp) + goto out; + + retval = security_task_getpgid(p); + if (retval) + goto out; } + retval = pid_vnr(grp); +out: + rcu_read_unlock(); + return retval; } #ifdef __ARCH_WANT_SYS_GETPGRP asmlinkage long sys_getpgrp(void) { - /* SMP - assuming writes are word atomic this is fine */ - return task_pgrp_vnr(current); + return sys_getpgid(0); } #endif asmlinkage long sys_getsid(pid_t pid) { + struct task_struct *p; + struct pid *sid; + int retval; + + rcu_read_lock(); if (!pid) - return task_session_vnr(current); + sid = task_session(current); else { - int retval; - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); retval = -ESRCH; - if (p) { - retval = security_task_getsid(p); - if (!retval) - retval = task_session_vnr(p); - } - rcu_read_unlock(); - return retval; + p = find_task_by_vpid(pid); + if (!p) + goto out; + sid = task_session(p); + if (!sid) + goto out; + + retval = security_task_getsid(p); + if (retval) + goto out; } + retval = pid_vnr(sid); +out: + rcu_read_unlock(); + return retval; } asmlinkage long sys_setsid(void) @@ -1539,6 +1557,19 @@ out: * */ +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, + cputime_t *utimep, cputime_t *stimep) +{ + *utimep = cputime_add(*utimep, t->utime); + *stimep = cputime_add(*stimep, t->stime); + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); +} + static void k_getrusage(struct task_struct *p, int who, struct rusage *r) { struct task_struct *t; @@ -1548,12 +1579,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; - rcu_read_lock(); - if (!lock_task_sighand(p, &flags)) { - rcu_read_unlock(); - return; + if (who == RUSAGE_THREAD) { + accumulate_thread_rusage(p, r, &utime, &stime); + goto out; } + if (!lock_task_sighand(p, &flags)) + return; + switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: @@ -1580,14 +1613,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += p->signal->oublock; t = p; do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - r->ru_nvcsw += t->nvcsw; - r->ru_nivcsw += t->nivcsw; - r->ru_minflt += t->min_flt; - r->ru_majflt += t->maj_flt; - r->ru_inblock += task_io_get_inblock(t); - r->ru_oublock += task_io_get_oublock(t); + accumulate_thread_rusage(t, r, &utime, &stime); t = next_thread(t); } while (t != p); break; @@ -1595,10 +1621,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) default: BUG(); } - unlock_task_sighand(p, &flags); - rcu_read_unlock(); +out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -1612,7 +1637,8 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru) asmlinkage long sys_getrusage(int who, struct rusage __user *ru) { - if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) return -EINVAL; return getrusage(current, who, ru); } @@ -1626,10 +1652,9 @@ asmlinkage long sys_umask(int mask) asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { - long error; + long error = 0; - error = security_task_prctl(option, arg2, arg3, arg4, arg5); - if (error) + if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) return error; switch (option) { @@ -1676,23 +1701,10 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, error = PR_TIMING_STATISTICAL; break; case PR_SET_TIMING: - if (arg2 == PR_TIMING_STATISTICAL) - error = 0; - else + if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; break; - case PR_GET_KEEPCAPS: - if (current->keep_capabilities) - error = 1; - break; - case PR_SET_KEEPCAPS: - if (arg2 != 0 && arg2 != 1) { - error = -EINVAL; - break; - } - current->keep_capabilities = arg2; - break; case PR_SET_NAME: { struct task_struct *me = current; unsigned char ncomm[sizeof(me->comm)]; @@ -1726,18 +1738,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_SECCOMP: error = prctl_set_seccomp(arg2); break; - - case PR_CAPBSET_READ: - if (!cap_valid(arg2)) - return -EINVAL; - return !!cap_raised(current->cap_bset, arg2); - case PR_CAPBSET_DROP: -#ifdef CONFIG_SECURITY_FILE_CAPABILITIES - return cap_prctl_drop(arg2); -#else - return -EINVAL; -#endif - + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2a2d6889ba..f6d2e57b99a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -38,6 +38,7 @@ #include <linux/writeback.h> #include <linux/hugetlb.h> #include <linux/initrd.h> +#include <linux/key.h> #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> @@ -80,6 +81,7 @@ extern int compat_log; extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; +extern int sysctl_nr_open_min, sysctl_nr_open_max; /* Constants used for minimum and maximum */ #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) @@ -130,8 +132,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -extern int sysctl_hz_timer; - #ifdef CONFIG_BSD_PROCESS_ACCT extern int acct_parm[]; #endif @@ -144,12 +144,6 @@ extern int no_unaligned_warning; extern int max_lock_depth; #endif -#ifdef CONFIG_SYSCTL_SYSCALL -static int parse_table(int __user *, int, void __user *, size_t __user *, - void __user *, size_t, struct ctl_table *); -#endif - - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -270,17 +264,6 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_batch_wakeup_granularity_ns", - .data = &sysctl_sched_batch_wakeup_granularity, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &min_wakeup_granularity_ns, - .extra2 = &max_wakeup_granularity_ns, - }, - { - .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), @@ -318,7 +301,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, @@ -326,7 +309,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, @@ -578,16 +561,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_NO_IDLE_HZ - { - .ctl_name = KERN_HZ_TIMER, - .procname = "hz_timer", - .data = &sysctl_hz_timer, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = KERN_S390_USER_DEBUG_LOGGING, .procname = "userprocess_debug", @@ -820,6 +793,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, +#ifdef CONFIG_KEYS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "keys", + .mode = 0555, + .child = key_sysctls, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -1198,7 +1179,9 @@ static struct ctl_table fs_table[] = { .data = &sysctl_nr_open, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, }, { .ctl_name = FS_DENTRY, @@ -1441,6 +1424,76 @@ void register_sysctl_root(struct ctl_table_root *root) } #ifdef CONFIG_SYSCTL_SYSCALL +/* Perform the actual read/write of a sysctl table entry. */ +static int do_sysctl_strategy(struct ctl_table_root *root, + struct ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen) +{ + int op = 0, rc; + + if (oldval) + op |= 004; + if (newval) + op |= 002; + if (sysctl_perm(root, table, op)) + return -EPERM; + + if (table->strategy) { + rc = table->strategy(table, name, nlen, oldval, oldlenp, + newval, newlen); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + } + + /* If there is no strategy routine, or if the strategy returns + * zero, proceed with automatic r/w */ + if (table->data && table->maxlen) { + rc = sysctl_data(table, name, nlen, oldval, oldlenp, + newval, newlen); + if (rc < 0) + return rc; + } + return 0; +} + +static int parse_table(int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + struct ctl_table_root *root, + struct ctl_table *table) +{ + int n; +repeat: + if (!nlen) + return -ENOTDIR; + if (get_user(n, name)) + return -EFAULT; + for ( ; table->ctl_name || table->procname; table++) { + if (!table->ctl_name) + continue; + if (n == table->ctl_name) { + int error; + if (table->child) { + if (sysctl_perm(root, table, 001)) + return -EPERM; + name++; + nlen--; + table = table->child; + goto repeat; + } + error = do_sysctl_strategy(root, table, name, nlen, + oldval, oldlenp, + newval, newlen); + return error; + } + } + return -ENOTDIR; +} + int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1458,7 +1511,8 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) { error = parse_table(name, nlen, oldval, oldlenp, - newval, newlen, head->ctl_table); + newval, newlen, + head->root, head->ctl_table); if (error != -ENOTDIR) { sysctl_head_finish(head); break; @@ -1504,84 +1558,22 @@ static int test_perm(int mode, int op) return -EACCES; } -int sysctl_perm(struct ctl_table *table, int op) +int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) { int error; + int mode; + error = security_sysctl(table, op); if (error) return error; - return test_perm(table->mode, op); -} - -#ifdef CONFIG_SYSCTL_SYSCALL -static int parse_table(int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - struct ctl_table *table) -{ - int n; -repeat: - if (!nlen) - return -ENOTDIR; - if (get_user(n, name)) - return -EFAULT; - for ( ; table->ctl_name || table->procname; table++) { - if (!table->ctl_name) - continue; - if (n == table->ctl_name) { - int error; - if (table->child) { - if (sysctl_perm(table, 001)) - return -EPERM; - name++; - nlen--; - table = table->child; - goto repeat; - } - error = do_sysctl_strategy(table, name, nlen, - oldval, oldlenp, - newval, newlen); - return error; - } - } - return -ENOTDIR; -} - -/* Perform the actual read/write of a sysctl table entry. */ -int do_sysctl_strategy (struct ctl_table *table, - int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int op = 0, rc; - - if (oldval) - op |= 004; - if (newval) - op |= 002; - if (sysctl_perm(table, op)) - return -EPERM; - if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); - if (rc < 0) - return rc; - if (rc > 0) - return 0; - } + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; - /* If there is no strategy routine, or if the strategy returns - * zero, proceed with automatic r/w */ - if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); - if (rc < 0) - return rc; - } - return 0; + return test_perm(mode, op); } -#endif /* CONFIG_SYSCTL_SYSCALL */ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) { @@ -1594,9 +1586,13 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) static __init int sysctl_init(void) { - int err; sysctl_set_parent(NULL, root_table); - err = sysctl_check_table(current->nsproxy, root_table); +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK + { + int err; + err = sysctl_check_table(current->nsproxy, root_table); + } +#endif return 0; } @@ -1723,10 +1719,12 @@ struct ctl_table_header *__register_sysctl_paths( header->unregistering = NULL; header->root = root; sysctl_set_parent(NULL, header->ctl_table); +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK if (sysctl_check_table(namespaces, header->ctl_table)) { kfree(header); return NULL; } +#endif spin_lock(&sysctl_lock); header_list = lookup_header_list(root, namespaces); list_add_tail(&header->ctl_entry, header_list); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 07e86a82807..4a23517169a 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -183,7 +183,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, if (!tsk) { rcu_read_lock(); - tsk = find_task_by_pid(pid); + tsk = find_task_by_vpid(pid); if (tsk) get_task_struct(tsk); rcu_read_unlock(); @@ -230,7 +230,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, */ rcu_read_lock(); if (!first) - first = find_task_by_pid(tgid); + first = find_task_by_vpid(tgid); if (!first || !lock_task_sighand(first, &flags)) goto out; @@ -547,7 +547,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) if (!stats) goto err; - rc = fill_pid(tsk->pid, tsk, stats); + rc = fill_pid(-1, tsk, stats); if (rc < 0) goto err; diff --git a/kernel/time.c b/kernel/time.c index a5ec013b6c8..6a08660b4fa 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -35,6 +35,8 @@ #include <linux/syscalls.h> #include <linux/security.h> #include <linux/fs.h> +#include <linux/slab.h> +#include <linux/math64.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -244,7 +246,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 - return ((u64)HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; + return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; # else return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; # endif @@ -260,7 +262,7 @@ unsigned int inline jiffies_to_usecs(const unsigned long j) return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); #else # if BITS_PER_LONG == 32 - return ((u64)HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; # else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; # endif @@ -379,6 +381,7 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) ts->tv_sec = sec; ts->tv_nsec = nsec; } +EXPORT_SYMBOL(set_normalized_timespec); /** * ns_to_timespec - Convert nanoseconds to timespec @@ -389,13 +392,17 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) struct timespec ns_to_timespec(const s64 nsec) { struct timespec ts; + s32 rem; if (!nsec) return (struct timespec) {0, 0}; - ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec); - if (unlikely(nsec < 0)) - set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec); + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; return ts; } @@ -469,7 +476,7 @@ unsigned long msecs_to_jiffies(const unsigned int m) if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; - return ((u64)MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; #endif } @@ -484,7 +491,7 @@ unsigned long usecs_to_jiffies(const unsigned int u) #elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) return u * (HZ / USEC_PER_SEC); #else - return ((u64)USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32; #endif } @@ -525,8 +532,10 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) * Convert jiffies to nanoseconds and separate with * one divide. */ - u64 nsec = (u64)jiffies * TICK_NSEC; - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &value->tv_nsec); + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; } EXPORT_SYMBOL(jiffies_to_timespec); @@ -564,12 +573,11 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) * Convert jiffies to nanoseconds and separate with * one divide. */ - u64 nsec = (u64)jiffies * TICK_NSEC; - long tv_usec; + u32 rem; - value->tv_sec = div_long_long_rem(nsec, NSEC_PER_SEC, &tv_usec); - tv_usec /= NSEC_PER_USEC; - value->tv_usec = tv_usec; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_usec = rem / NSEC_PER_USEC; } EXPORT_SYMBOL(jiffies_to_timeval); @@ -585,9 +593,7 @@ clock_t jiffies_to_clock_t(long x) return x / (HZ / USER_HZ); # endif #else - u64 tmp = (u64)x * TICK_NSEC; - do_div(tmp, (NSEC_PER_SEC / USER_HZ)); - return (long)tmp; + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); #endif } EXPORT_SYMBOL(jiffies_to_clock_t); @@ -599,16 +605,12 @@ unsigned long clock_t_to_jiffies(unsigned long x) return ~0UL; return x * (HZ / USER_HZ); #else - u64 jif; - /* Don't worry about loss of precision here .. */ if (x >= ~0UL / HZ * USER_HZ) return ~0UL; /* .. but do try to contain it here */ - jif = x * (u64) HZ; - do_div(jif, USER_HZ); - return jif; + return div_u64((u64)x * HZ, USER_HZ); #endif } EXPORT_SYMBOL(clock_t_to_jiffies); @@ -617,10 +619,9 @@ u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ - x *= USER_HZ; - do_div(x, HZ); + x = div_u64(x * USER_HZ, HZ); # elif HZ > USER_HZ - do_div(x, HZ / USER_HZ); + x = div_u64(x, HZ / USER_HZ); # else /* Nothing to do */ # endif @@ -630,8 +631,7 @@ u64 jiffies_64_to_clock_t(u64 x) * but even this doesn't overflow in hundreds of years * in 64 bits, so.. */ - x *= TICK_NSEC; - do_div(x, (NSEC_PER_SEC / USER_HZ)); + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); #endif return x; } @@ -640,21 +640,17 @@ EXPORT_SYMBOL(jiffies_64_to_clock_t); u64 nsec_to_clock_t(u64 x) { #if (NSEC_PER_SEC % USER_HZ) == 0 - do_div(x, (NSEC_PER_SEC / USER_HZ)); + return div_u64(x, NSEC_PER_SEC / USER_HZ); #elif (USER_HZ % 512) == 0 - x *= USER_HZ/512; - do_div(x, (NSEC_PER_SEC / 512)); + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); #else /* * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, * overflow after 64.99 years. * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... */ - x *= 9; - do_div(x, (unsigned long)((9ull * NSEC_PER_SEC + (USER_HZ/2)) / - USER_HZ)); + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); #endif - return x; } #if (BITS_PER_LONG < 64) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 278534bbca9..dadde5361f3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,8 +141,16 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - __mod_timer(&watchdog_timer, - watchdog_timer.expires + WATCHDOG_INTERVAL); + /* + * Cycle through CPUs to check if the CPUs stay + * synchronized to each other. + */ + int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); + + if (next_cpu >= NR_CPUS) + next_cpu = first_cpu(cpu_online_map); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_on(&watchdog_timer, next_cpu); } spin_unlock(&watchdog_lock); } @@ -164,7 +172,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer(&watchdog_timer); + add_timer_on(&watchdog_timer, + first_cpu(cpu_online_map)); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -174,7 +183,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (watchdog) del_timer(&watchdog_timer); watchdog = cs; - init_timer_deferrable(&watchdog_timer); + init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */ @@ -185,7 +194,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer(&watchdog_timer); + add_timer_on(&watchdog_timer, + first_cpu(cpu_online_map)); } } } @@ -222,6 +232,18 @@ void clocksource_resume(void) } /** + * clocksource_touch_watchdog - Update watchdog + * + * Update the watchdog after exception contexts such as kgdb so as not + * to incorrectly trip the watchdog. + * + */ +void clocksource_touch_watchdog(void) +{ + clocksource_resume_watchdog(); +} + +/** * clocksource_get_next - Returns the selected clocksource * */ @@ -449,10 +471,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) /* * Sysfs setup bits: */ -static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, +static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, sysfs_override_clocksource); -static SYSDEV_ATTR(available_clocksource, 0600, +static SYSDEV_ATTR(available_clocksource, 0444, sysfs_show_available_clocksources, NULL); static struct sysdev_class clocksource_sysclass = { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5fd9b946977..5125ddd8196 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -15,7 +15,8 @@ #include <linux/jiffies.h> #include <linux/hrtimer.h> #include <linux/capability.h> -#include <asm/div64.h> +#include <linux/math64.h> +#include <linux/clocksource.h> #include <asm/timex.h> /* @@ -23,11 +24,14 @@ */ unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */ unsigned long tick_nsec; /* ACTHZ period (nsec) */ -static u64 tick_length, tick_length_base; +u64 tick_length; +static u64 tick_length_base; + +static struct hrtimer leap_timer; #define MAX_TICKADJ 500 /* microsecs */ #define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \ - TICK_LENGTH_SHIFT) / NTP_INTERVAL_FREQ) + NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) /* * phase-lock loop variables @@ -35,11 +39,12 @@ static u64 tick_length, tick_length_base; /* TIME_ERROR prevents overwriting the CMOS clock */ static int time_state = TIME_OK; /* clock synchronization status */ int time_status = STA_UNSYNC; /* clock status bits */ -static s64 time_offset; /* time adjustment (ns) */ +static long time_tai; /* TAI offset (s) */ +static s64 time_offset; /* time adjustment (ns) */ static long time_constant = 2; /* pll time constant */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_freq; /* frequency offset (scaled ppm)*/ +static s64 time_freq; /* frequency offset (scaled ns/s)*/ static long time_reftime; /* time at last adjustment (s) */ long time_adjust; static long ntp_tick_adj; @@ -47,16 +52,56 @@ static long ntp_tick_adj; static void ntp_update_frequency(void) { u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) - << TICK_LENGTH_SHIFT; - second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; - second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); + << NTP_SCALE_SHIFT; + second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT; + second_length += time_freq; tick_length_base = second_length; - do_div(second_length, HZ); - tick_nsec = second_length >> TICK_LENGTH_SHIFT; + tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; + tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ); +} + +static void ntp_update_offset(long offset) +{ + long mtemp; + s64 freq_adj; + + if (!(time_status & STA_PLL)) + return; - do_div(tick_length_base, NTP_INTERVAL_FREQ); + if (!(time_status & STA_NANO)) + offset *= NSEC_PER_USEC; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + offset = min(offset, MAXPHASE); + offset = max(offset, -MAXPHASE); + + /* + * Select how the frequency is to be controlled + * and in which mode (PLL or FLL). + */ + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = xtime.tv_sec; + mtemp = xtime.tv_sec - time_reftime; + time_reftime = xtime.tv_sec; + + freq_adj = (s64)offset * mtemp; + freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant); + time_status &= ~STA_MODE; + if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL), + mtemp); + time_status |= STA_MODE; + } + freq_adj += time_freq; + freq_adj = min(freq_adj, MAXFREQ_SCALED); + time_freq = max(freq_adj, -MAXFREQ_SCALED); + + time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } /** @@ -78,62 +123,70 @@ void ntp_clear(void) } /* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. */ -void second_overflow(void) +static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) { - long time_adj; + enum hrtimer_restart res = HRTIMER_NORESTART; - /* Bump the maxerror field */ - time_maxerror += MAXFREQ >> SHIFT_USEC; - if (time_maxerror > NTP_PHASE_LIMIT) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } + write_seqlock_irq(&xtime_lock); - /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. The microtime() - * routine or external clock driver will insure that reported time is - * always monotonic. The ugly divides should be replaced. - */ switch (time_state) { case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; break; case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - wall_to_monotonic.tv_sec++; - time_state = TIME_OOP; - printk(KERN_NOTICE "Clock: inserting leap second " - "23:59:60 UTC\n"); - } + xtime.tv_sec--; + wall_to_monotonic.tv_sec++; + time_state = TIME_OOP; + printk(KERN_NOTICE "Clock: " + "inserting leap second 23:59:60 UTC\n"); + leap_timer.expires = ktime_add_ns(leap_timer.expires, + NSEC_PER_SEC); + res = HRTIMER_RESTART; break; case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - wall_to_monotonic.tv_sec--; - time_state = TIME_WAIT; - printk(KERN_NOTICE "Clock: deleting leap second " - "23:59:59 UTC\n"); - } + xtime.tv_sec++; + time_tai--; + wall_to_monotonic.tv_sec--; + time_state = TIME_WAIT; + printk(KERN_NOTICE "Clock: " + "deleting leap second 23:59:59 UTC\n"); break; case TIME_OOP: + time_tai++; time_state = TIME_WAIT; - break; + /* fall through */ case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; + time_state = TIME_OK; + break; + } + update_vsyscall(&xtime, clock); + + write_sequnlock_irq(&xtime_lock); + + return res; +} + +/* + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + */ +void second_overflow(void) +{ + s64 time_adj; + + /* Bump the maxerror field */ + time_maxerror += MAXFREQ / NSEC_PER_USEC; + if (time_maxerror > NTP_PHASE_LIMIT) { + time_maxerror = NTP_PHASE_LIMIT; + time_status |= STA_UNSYNC; } /* @@ -143,7 +196,7 @@ void second_overflow(void) tick_length = tick_length_base; time_adj = shift_right(time_offset, SHIFT_PLL + time_constant); time_offset -= time_adj; - tick_length += (s64)time_adj << (TICK_LENGTH_SHIFT - SHIFT_UPDATE); + tick_length += time_adj; if (unlikely(time_adjust)) { if (time_adjust > MAX_TICKADJ) { @@ -154,25 +207,12 @@ void second_overflow(void) tick_length -= MAX_TICKADJ_SCALED; } else { tick_length += (s64)(time_adjust * NSEC_PER_USEC / - NTP_INTERVAL_FREQ) << TICK_LENGTH_SHIFT; + NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; } } } -/* - * Return how long ticks are at the moment, that is, how much time - * update_wall_time_one_tick will add to xtime next time we call it - * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds shifted by the - * specified number of bits to the right of the binary point. - * This function has no side-effects. - */ -u64 current_tick_length(void) -{ - return tick_length; -} - #ifdef CONFIG_GENERIC_CMOS_UPDATE /* Disable the cmos update - used by virtualization and embedded */ @@ -236,8 +276,8 @@ static inline void notify_cmos_timer(void) { } */ int do_adjtimex(struct timex *txc) { - long mtemp, save_adjust, rem; - s64 freq_adj, temp64; + struct timespec ts; + long save_adjust, sec; int result; /* In order to modify anything, you gotta be super-user! */ @@ -247,147 +287,132 @@ int do_adjtimex(struct timex *txc) /* Now we validate the data before disabling interrupts */ if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { - /* singleshot must not be used with any other mode bits */ - if (txc->modes != ADJ_OFFSET_SINGLESHOT && - txc->modes != ADJ_OFFSET_SS_READ) + /* singleshot must not be used with any other mode bits */ + if (txc->modes & ~ADJ_OFFSET_SS_READ) return -EINVAL; } - if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) - /* adjustment Offset limited to +- .512 seconds */ - if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) - return -EINVAL; - /* if the quartz is off by more than 10% something is VERY wrong ! */ if (txc->modes & ADJ_TICK) if (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ) return -EINVAL; + if (time_state != TIME_OK && txc->modes & ADJ_STATUS) + hrtimer_cancel(&leap_timer); + getnstimeofday(&ts); + write_seqlock_irq(&xtime_lock); - result = time_state; /* mostly `TIME_OK' */ /* Save for later - semantics of adjtime is to return old value */ save_adjust = time_adjust; -#if 0 /* STA_CLOCKERR is never set yet */ - time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ -#endif /* If there are input parameters, then process them */ - if (txc->modes) - { - if (txc->modes & ADJ_STATUS) /* only set allowed bits */ - time_status = (txc->status & ~STA_RONLY) | - (time_status & STA_RONLY); - - if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ - if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { - result = -EINVAL; - goto leave; - } - time_freq = ((s64)txc->freq * NSEC_PER_USEC) - >> (SHIFT_USEC - SHIFT_NSEC); - } - - if (txc->modes & ADJ_MAXERROR) { - if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; + if (txc->modes) { + if (txc->modes & ADJ_STATUS) { + if ((time_status & STA_PLL) && + !(txc->status & STA_PLL)) { + time_state = TIME_OK; + time_status = STA_UNSYNC; + } + /* only set allowed bits */ + time_status &= STA_RONLY; + time_status |= txc->status & ~STA_RONLY; + + switch (time_state) { + case TIME_OK: + start_timer: + sec = ts.tv_sec; + if (time_status & STA_INS) { + time_state = TIME_INS; + sec += 86400 - sec % 86400; + hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); + } else if (time_status & STA_DEL) { + time_state = TIME_DEL; + sec += 86400 - (sec + 1) % 86400; + hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS); + } + break; + case TIME_INS: + case TIME_DEL: + time_state = TIME_OK; + goto start_timer; + break; + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + case TIME_OOP: + hrtimer_restart(&leap_timer); + break; + } } - time_maxerror = txc->maxerror; - } - if (txc->modes & ADJ_ESTERROR) { - if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; + if (txc->modes & ADJ_NANO) + time_status |= STA_NANO; + if (txc->modes & ADJ_MICRO) + time_status &= ~STA_NANO; + + if (txc->modes & ADJ_FREQUENCY) { + time_freq = (s64)txc->freq * PPM_SCALE; + time_freq = min(time_freq, MAXFREQ_SCALED); + time_freq = max(time_freq, -MAXFREQ_SCALED); } - time_esterror = txc->esterror; - } - if (txc->modes & ADJ_TIMECONST) { /* p. 24 */ - if (txc->constant < 0) { /* NTP v4 uses values > 6 */ - result = -EINVAL; - goto leave; + if (txc->modes & ADJ_MAXERROR) + time_maxerror = txc->maxerror; + if (txc->modes & ADJ_ESTERROR) + time_esterror = txc->esterror; + + if (txc->modes & ADJ_TIMECONST) { + time_constant = txc->constant; + if (!(time_status & STA_NANO)) + time_constant += 4; + time_constant = min(time_constant, (long)MAXTC); + time_constant = max(time_constant, 0l); } - time_constant = min(txc->constant + 4, (long)MAXTC); - } - if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ - if (txc->modes == ADJ_OFFSET_SINGLESHOT) { - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; + if (txc->modes & ADJ_TAI && txc->constant > 0) + time_tai = txc->constant; + + if (txc->modes & ADJ_OFFSET) { + if (txc->modes == ADJ_OFFSET_SINGLESHOT) + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + else + ntp_update_offset(txc->offset); } - else if (time_status & STA_PLL) { - time_offset = txc->offset * NSEC_PER_USEC; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC); - time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC); - - /* - * Select whether the frequency is to be controlled - * and in which mode (PLL or FLL). Clamp to the operating - * range. Ugly multiply/divide should be replaced someday. - */ - - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - - freq_adj = time_offset * mtemp; - freq_adj = shift_right(freq_adj, time_constant * 2 + - (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); - if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { - u64 utemp64; - temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); - if (time_offset < 0) { - utemp64 = -temp64; - do_div(utemp64, mtemp); - freq_adj -= utemp64; - } else { - utemp64 = temp64; - do_div(utemp64, mtemp); - freq_adj += utemp64; - } - } - freq_adj += time_freq; - freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC); - time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC); - time_offset = div_long_long_rem_signed(time_offset, - NTP_INTERVAL_FREQ, - &rem); - time_offset <<= SHIFT_UPDATE; - } /* STA_PLL */ - } /* txc->modes & ADJ_OFFSET */ - if (txc->modes & ADJ_TICK) - tick_usec = txc->tick; - - if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) - ntp_update_frequency(); - } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) + if (txc->modes & ADJ_TICK) + tick_usec = txc->tick; + + if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) + ntp_update_frequency(); + } + + result = time_state; /* mostly `TIME_OK' */ + if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) + (txc->modes == ADJ_OFFSET_SS_READ)) txc->offset = save_adjust; - else - txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * - NTP_INTERVAL_FREQ / 1000; - txc->freq = (time_freq / NSEC_PER_USEC) << - (SHIFT_USEC - SHIFT_NSEC); + else { + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + } + txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * + (s64)PPM_SCALE_INV, + NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; txc->constant = time_constant; txc->precision = 1; - txc->tolerance = MAXFREQ; + txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; + txc->tai = time_tai; /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; @@ -399,9 +424,15 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) txc->errcnt = 0; txc->stbcnt = 0; write_sequnlock_irq(&xtime_lock); - do_gettimeofday(&txc->time); + + txc->time.tv_sec = ts.tv_sec; + txc->time.tv_usec = ts.tv_nsec; + if (!(time_status & STA_NANO)) + txc->time.tv_usec /= NSEC_PER_USEC; + notify_cmos_timer(); - return(result); + + return result; } static int __init ntp_tick_adj_setup(char *str) @@ -411,3 +442,10 @@ static int __init ntp_tick_adj_setup(char *str) } __setup("ntp_tick_adj=", ntp_tick_adj_setup); + +void __init ntp_init(void) +{ + ntp_clear(); + hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + leap_timer.function = ntp_leap_second; +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e1bd50cbbf5..57a1f02e5ec 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -14,7 +14,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> @@ -262,7 +262,7 @@ out: void tick_broadcast_on_off(unsigned long reason, int *oncpu) { if (!cpu_isset(*oncpu, cpu_online_map)) - printk(KERN_ERR "tick-braodcast: ignoring broadcast for " + printk(KERN_ERR "tick-broadcast: ignoring broadcast for " "offline CPU #%d\n", *oncpu); else smp_call_function_single(*oncpu, tick_do_broadcast_on_off, diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1bea399a9ef..4f3886562b8 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -14,12 +14,14 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> #include <linux/tick.h> +#include <asm/irq_regs.h> + #include "tick-internal.h" /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0258d3115d5..450c04935b6 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -14,7 +14,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> -#include <linux/irq.h> +#include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/profile.h> #include <linux/sched.h> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 686da821d37..b854a895591 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,9 +158,8 @@ void tick_nohz_stop_idle(int cpu) } } -static ktime_t tick_nohz_start_idle(int cpu) +static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t now, delta; now = ktime_get(); @@ -192,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -201,8 +199,8 @@ void tick_nohz_stop_sched_tick(void) local_irq_save(flags); cpu = smp_processor_id(); - now = tick_nohz_start_idle(cpu); ts = &per_cpu(tick_cpu_sched, cpu); + now = tick_nohz_start_idle(ts); /* * If this cpu is offline and it is the one which updates @@ -222,7 +220,6 @@ void tick_nohz_stop_sched_tick(void) if (need_resched()) goto end; - cpu = smp_processor_id(); if (unlikely(local_softirq_pending())) { static int ratelimit; @@ -245,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - rt_jiffies = rt_needs_cpu(cpu); - if (rt_jiffies && rt_jiffies < delta_jiffies) - delta_jiffies = rt_jiffies; - if (rcu_needs_cpu(cpu)) delta_jiffies = 1; /* @@ -400,6 +393,7 @@ void tick_nohz_restart_sched_tick(void) sub_preempt_count(HARDIRQ_OFFSET); } + touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a3fa587c350..e91c29f961c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -53,7 +53,7 @@ void update_xtime_cache(u64 nsec) timespec_add_ns(&xtime_cache, nsec); } -static struct clocksource *clock; /* pointer to current clocksource */ +struct clocksource *clock; #ifdef CONFIG_GENERIC_TIME @@ -178,6 +178,7 @@ static void change_clocksource(void) if (clock == new) return; + new->cycle_last = 0; now = clocksource_read(new); nsec = __get_nsec_offset(); timespec_add_ns(&xtime, nsec); @@ -245,7 +246,7 @@ void __init timekeeping_init(void) write_seqlock_irqsave(&xtime_lock, flags); - ntp_clear(); + ntp_init(); clock = clocksource_get_next(); clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev) timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ + clock->cycle_last = 0; clock->cycle_last = clocksource_read(clock); clock->error = 0; timekeeping_suspended = 0; @@ -369,7 +371,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * here. This is tuned so that an error of about 1 msec is adjusted * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). */ - error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ); + error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); error2 = abs(error2); for (look_ahead = 0; error2 > 0; look_ahead++) error2 >>= 2; @@ -378,8 +380,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = current_tick_length() >> - (TICK_LENGTH_SHIFT - clock->shift + 1); + tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); tick_error -= clock->xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -410,7 +411,7 @@ static void clocksource_adjust(s64 offset) s64 error, interval = clock->cycle_interval; int adj; - error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); if (error > interval) { error >>= 2; if (likely(error <= interval)) @@ -432,7 +433,7 @@ static void clocksource_adjust(s64 offset) clock->xtime_interval += interval; clock->xtime_nsec -= offset; clock->error -= (interval - offset) << - (TICK_LENGTH_SHIFT - clock->shift); + (NTP_SCALE_SHIFT - clock->shift); } /** @@ -471,8 +472,8 @@ void update_wall_time(void) } /* accumulate error between NTP and clock interval */ - clock->error += current_tick_length(); - clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + clock->error += tick_length; + clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); } /* correct the clock when NTP error is too big */ diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 67fe8fc21fb..a40e20fd000 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -278,12 +278,9 @@ static int __init init_timer_list_procfs(void) { struct proc_dir_entry *pe; - pe = create_proc_entry("timer_list", 0644, NULL); + pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); if (!pe) return -ENOMEM; - - pe->proc_fops = &timer_list_fops; - return 0; } __initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 417da8c5bc7..c994530d166 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -415,12 +415,9 @@ static int __init init_tstats_procfs(void) { struct proc_dir_entry *pe; - pe = create_proc_entry("timer_stats", 0644, NULL); + pe = proc_create("timer_stats", 0644, NULL, &tstats_fops); if (!pe) return -ENOMEM; - - pe->proc_fops = &tstats_fops; - return 0; } __initcall(init_tstats_procfs); diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl index 41468035473..eb51d76e058 100644 --- a/kernel/timeconst.pl +++ b/kernel/timeconst.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl # ----------------------------------------------------------------------- # -# Copyright 2007 rPath, Inc. - All Rights Reserved +# Copyright 2007-2008 rPath, Inc. - All Rights Reserved # # This file is part of the Linux kernel, and is made available under # the terms of the GNU General Public License version 2 or (at your @@ -20,198 +20,138 @@ %canned_values = ( 24 => [ '0xa6aaaaab','0x2aaaaaa',26, - '0xa6aaaaaaaaaaaaab','0x2aaaaaaaaaaaaaa',58, 125,3, '0xc49ba5e4','0x1fbe76c8b4',37, - '0xc49ba5e353f7ceda','0x1fbe76c8b439581062',69, 3,125, '0xa2c2aaab','0xaaaa',16, - '0xa2c2aaaaaaaaaaab','0xaaaaaaaaaaaa',48, 125000,3, '0xc9539b89','0x7fffbce4217d',47, - '0xc9539b8887229e91','0x7fffbce4217d2849cb25',79, 3,125000, ], 32 => [ '0xfa000000','0x6000000',27, - '0xfa00000000000000','0x600000000000000',59, 125,4, '0x83126e98','0xfdf3b645a',36, - '0x83126e978d4fdf3c','0xfdf3b645a1cac0831',68, 4,125, '0xf4240000','0x0',17, - '0xf424000000000000','0x0',49, 31250,1, '0x8637bd06','0x3fff79c842fa',46, - '0x8637bd05af6c69b6','0x3fff79c842fa5093964a',78, 1,31250, ], 48 => [ '0xa6aaaaab','0x6aaaaaa',27, - '0xa6aaaaaaaaaaaaab','0x6aaaaaaaaaaaaaa',59, 125,6, '0xc49ba5e4','0xfdf3b645a',36, - '0xc49ba5e353f7ceda','0xfdf3b645a1cac0831',68, 6,125, '0xa2c2aaab','0x15555',17, - '0xa2c2aaaaaaaaaaab','0x1555555555555',49, 62500,3, '0xc9539b89','0x3fffbce4217d',46, - '0xc9539b8887229e91','0x3fffbce4217d2849cb25',78, 3,62500, ], 64 => [ '0xfa000000','0xe000000',28, - '0xfa00000000000000','0xe00000000000000',60, 125,8, '0x83126e98','0x7ef9db22d',35, - '0x83126e978d4fdf3c','0x7ef9db22d0e560418',67, 8,125, '0xf4240000','0x0',18, - '0xf424000000000000','0x0',50, 15625,1, '0x8637bd06','0x1fff79c842fa',45, - '0x8637bd05af6c69b6','0x1fff79c842fa5093964a',77, 1,15625, ], 100 => [ '0xa0000000','0x0',28, - '0xa000000000000000','0x0',60, 10,1, '0xcccccccd','0x733333333',35, - '0xcccccccccccccccd','0x73333333333333333',67, 1,10, '0x9c400000','0x0',18, - '0x9c40000000000000','0x0',50, 10000,1, '0xd1b71759','0x1fff2e48e8a7',45, - '0xd1b71758e219652c','0x1fff2e48e8a71de69ad4',77, 1,10000, ], 122 => [ '0x8325c53f','0xfbcda3a',28, - '0x8325c53ef368eb05','0xfbcda3ac10c9714',60, 500,61, '0xf9db22d1','0x7fbe76c8b',35, - '0xf9db22d0e560418a','0x7fbe76c8b43958106',67, 61,500, '0x8012e2a0','0x3ef36',18, - '0x8012e29f79b47583','0x3ef368eb04325',50, 500000,61, '0xffda4053','0x1ffffbce4217',45, - '0xffda4052d666a983','0x1ffffbce4217d2849cb2',77, 61,500000, ], 128 => [ '0xfa000000','0x1e000000',29, - '0xfa00000000000000','0x1e00000000000000',61, 125,16, '0x83126e98','0x3f7ced916',34, - '0x83126e978d4fdf3c','0x3f7ced916872b020c',66, 16,125, '0xf4240000','0x40000',19, - '0xf424000000000000','0x4000000000000',51, 15625,2, '0x8637bd06','0xfffbce4217d',44, - '0x8637bd05af6c69b6','0xfffbce4217d2849cb25',76, 2,15625, ], 200 => [ '0xa0000000','0x0',29, - '0xa000000000000000','0x0',61, 5,1, '0xcccccccd','0x333333333',34, - '0xcccccccccccccccd','0x33333333333333333',66, 1,5, '0x9c400000','0x0',19, - '0x9c40000000000000','0x0',51, 5000,1, '0xd1b71759','0xfff2e48e8a7',44, - '0xd1b71758e219652c','0xfff2e48e8a71de69ad4',76, 1,5000, ], 250 => [ '0x80000000','0x0',29, - '0x8000000000000000','0x0',61, 4,1, '0x80000000','0x180000000',33, - '0x8000000000000000','0x18000000000000000',65, 1,4, '0xfa000000','0x0',20, - '0xfa00000000000000','0x0',52, 4000,1, '0x83126e98','0x7ff7ced9168',43, - '0x83126e978d4fdf3c','0x7ff7ced916872b020c4',75, 1,4000, ], 256 => [ '0xfa000000','0x3e000000',30, - '0xfa00000000000000','0x3e00000000000000',62, 125,32, '0x83126e98','0x1fbe76c8b',33, - '0x83126e978d4fdf3c','0x1fbe76c8b43958106',65, 32,125, '0xf4240000','0xc0000',20, - '0xf424000000000000','0xc000000000000',52, 15625,4, '0x8637bd06','0x7ffde7210be',43, - '0x8637bd05af6c69b6','0x7ffde7210be9424e592',75, 4,15625, ], 300 => [ '0xd5555556','0x2aaaaaaa',30, - '0xd555555555555556','0x2aaaaaaaaaaaaaaa',62, 10,3, '0x9999999a','0x1cccccccc',33, - '0x999999999999999a','0x1cccccccccccccccc',65, 3,10, '0xd0555556','0xaaaaa',20, - '0xd055555555555556','0xaaaaaaaaaaaaa',52, 10000,3, '0x9d495183','0x7ffcb923a29',43, - '0x9d495182a9930be1','0x7ffcb923a29c779a6b5',75, 3,10000, ], 512 => [ '0xfa000000','0x7e000000',31, - '0xfa00000000000000','0x7e00000000000000',63, 125,64, '0x83126e98','0xfdf3b645',32, - '0x83126e978d4fdf3c','0xfdf3b645a1cac083',64, 64,125, '0xf4240000','0x1c0000',21, - '0xf424000000000000','0x1c000000000000',53, 15625,8, '0x8637bd06','0x3ffef39085f',42, - '0x8637bd05af6c69b6','0x3ffef39085f4a1272c9',74, 8,15625, ], 1000 => [ '0x80000000','0x0',31, - '0x8000000000000000','0x0',63, 1,1, '0x80000000','0x0',31, - '0x8000000000000000','0x0',63, 1,1, '0xfa000000','0x0',22, - '0xfa00000000000000','0x0',54, 1000,1, '0x83126e98','0x1ff7ced9168',41, - '0x83126e978d4fdf3c','0x1ff7ced916872b020c4',73, 1,1000, ], 1024 => [ '0xfa000000','0xfe000000',32, - '0xfa00000000000000','0xfe00000000000000',64, 125,128, '0x83126e98','0x7ef9db22',31, - '0x83126e978d4fdf3c','0x7ef9db22d0e56041',63, 128,125, '0xf4240000','0x3c0000',22, - '0xf424000000000000','0x3c000000000000',54, 15625,16, '0x8637bd06','0x1fff79c842f',41, - '0x8637bd05af6c69b6','0x1fff79c842fa5093964',73, 16,15625, ], 1200 => [ '0xd5555556','0xd5555555',32, - '0xd555555555555556','0xd555555555555555',64, 5,6, '0x9999999a','0x66666666',31, - '0x999999999999999a','0x6666666666666666',63, 6,5, '0xd0555556','0x2aaaaa',22, - '0xd055555555555556','0x2aaaaaaaaaaaaa',54, 2500,3, '0x9d495183','0x1ffcb923a29',41, - '0x9d495182a9930be1','0x1ffcb923a29c779a6b5',73, 3,2500, ] ); @@ -264,6 +204,15 @@ sub fmuls($$$) { return 0; } +# Generate a hex value if the result fits in 64 bits; +# otherwise skip. +sub bignum_hex($) { + my($x) = @_; + my $s = $x->as_hex(); + + return (length($s) > 18) ? undef : $s; +} + # Provides mul, adj, and shr factors for a specific # (bit, time, hz) combination sub muladj($$$) { @@ -271,7 +220,7 @@ sub muladj($$$) { my $s = fmuls($b, $t, $hz); my $m = fmul($s, $t, $hz); my $a = fadj($s, $t, $hz); - return ($m->as_hex(), $a->as_hex(), $s); + return (bignum_hex($m), bignum_hex($a), $s); } # Provides numerator, denominator values @@ -288,12 +237,10 @@ sub conversions($$) { # HZ_TO_xx push(@val, muladj(32, $t, $hz)); - push(@val, muladj(64, $t, $hz)); push(@val, numden($t, $hz)); # xx_TO_HZ push(@val, muladj(32, $hz, $t)); - push(@val, muladj(64, $hz, $t)); push(@val, numden($hz, $t)); return @val; @@ -318,6 +265,19 @@ sub compute_values($) { return @val; } +sub outputval($$) +{ + my($name, $val) = @_; + my $csuf; + + if (defined($val)) { + if ($name !~ /SHR/) { + $val = "U64_C($val)"; + } + printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; + } +} + sub output($@) { my($hz, @val) = @_; @@ -331,6 +291,7 @@ sub output($@) print "\n"; print "#include <linux/param.h>\n"; + print "#include <linux/types.h>\n"; print "\n"; print "#if HZ != $hz\n"; @@ -340,15 +301,13 @@ sub output($@) foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', 'HZ_TO_USEC','USEC_TO_HZ') { - foreach $bit (32, 64) { + foreach $bit (32) { foreach $suf ('MUL', 'ADJ', 'SHR') { - printf "#define %-23s %s\n", - "${pfx}_$suf$bit", shift(@val); + outputval("${pfx}_$suf$bit", shift(@val)); } } foreach $suf ('NUM', 'DEN') { - printf "#define %-23s %s\n", - "${pfx}_$suf", shift(@val); + outputval("${pfx}_$suf", shift(@val)); } } @@ -356,6 +315,23 @@ sub output($@) print "#endif /* KERNEL_TIMECONST_H */\n"; } +# Pretty-print Perl values +sub perlvals(@) { + my $v; + my @l = (); + + foreach $v (@_) { + if (!defined($v)) { + push(@l, 'undef'); + } elsif ($v =~ /^0x/) { + push(@l, "\'".$v."\'"); + } else { + push(@l, $v.''); + } + } + return join(',', @l); +} + ($hz) = @ARGV; # Use this to generate the %canned_values structure @@ -373,15 +349,15 @@ if ($hz eq '--can') { print "$pf$hz => [\n"; while (scalar(@values)) { my $bit; - foreach $bit (32, 64) { + foreach $bit (32) { my $m = shift(@values); my $a = shift(@values); my $s = shift(@values); - print "\t\t\'",$m,"\',\'",$a,"\',",$s,",\n"; + print "\t\t", perlvals($m,$a,$s), ",\n"; } my $n = shift(@values); my $d = shift(@values); - print "\t\t",$n,',',$d,",\n"; + print "\t\t", perlvals($n,$d), ",\n"; } print "\t]"; $pf = ', '; diff --git a/kernel/timer.c b/kernel/timer.c index 99b00a25f88..ef3fa6906e8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -320,14 +320,130 @@ static void timer_stats_account_timer(struct timer_list *timer) static void timer_stats_account_timer(struct timer_list *timer) {} #endif -/** - * init_timer - initialize a timer. - * @timer: the timer to be initialized - * - * init_timer() must be done to a timer prior calling *any* of the - * other timer functions. +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr timer_debug_descr; + +/* + * fixup_init is called when: + * - an active object is initialized */ -void init_timer(struct timer_list *timer) +static int timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (timer->entry.next == NULL && + timer->entry.prev == TIMER_ENTRY_STATIC) { + debug_object_init(timer, &timer_debug_descr); + debug_object_activate(timer, &timer_debug_descr); + return 0; + } else { + WARN_ON_ONCE(1); + } + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_free(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} + +static void __init_timer(struct timer_list *timer); + +void init_timer_on_stack(struct timer_list *timer) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + __init_timer(timer); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +#endif + +static void __init_timer(struct timer_list *timer) { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); @@ -337,6 +453,19 @@ void init_timer(struct timer_list *timer) memset(timer->start_comm, 0, TASK_COMM_LEN); #endif } + +/** + * init_timer - initialize a timer. + * @timer: the timer to be initialized + * + * init_timer() must be done to a timer prior calling *any* of the + * other timer functions. + */ +void init_timer(struct timer_list *timer) +{ + debug_timer_init(timer); + __init_timer(timer); +} EXPORT_SYMBOL(init_timer); void init_timer_deferrable(struct timer_list *timer) @@ -351,6 +480,8 @@ static inline void detach_timer(struct timer_list *timer, { struct list_head *entry = &timer->entry; + debug_timer_deactivate(timer); + __list_del(entry->prev, entry->next); if (clear_pending) entry->next = NULL; @@ -405,6 +536,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) ret = 1; } + debug_timer_activate(timer); + new_base = __get_cpu_var(tvec_bases); if (base != new_base) { @@ -450,11 +583,20 @@ void add_timer_on(struct timer_list *timer, int cpu) BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); + debug_timer_activate(timer); internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -670,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) +#ifdef CONFIG_NO_HZ /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a cpus is idle. @@ -805,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) return cmp_next_hrtimer_event(now, expires); } - -#ifdef CONFIG_NO_IDLE_HZ -unsigned long next_timer_interrupt(void) -{ - return get_next_timer_interrupt(jiffies); -} -#endif - #endif #ifndef CONFIG_VIRT_CPU_ACCOUNTING @@ -1078,11 +1212,14 @@ signed long __sched schedule_timeout(signed long timeout) expire = timeout + jiffies; - setup_timer(&timer, process_timeout, (unsigned long)current); + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); __mod_timer(&timer, expire); schedule(); del_singleshot_timer_sync(&timer); + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); + timeout = expire - jiffies; out: @@ -1220,13 +1357,6 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) return 0; } -/* - * lockdep: we want to track each per-CPU base as a separate lock-class, - * but timer-bases are kmalloc()-ed, so we need to attach separate - * keys to them: - */ -static struct lock_class_key base_lock_keys[NR_CPUS]; - static int __cpuinit init_timers_cpu(int cpu) { int j; @@ -1269,7 +1399,6 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); - lockdep_set_class(&base->lock, base_lock_keys + cpu); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); @@ -1308,8 +1437,8 @@ static void __cpuinit migrate_timers(int cpu) new_base = get_cpu_var(tvec_bases); local_irq_disable(); - double_spin_lock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_lock(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1322,8 +1451,8 @@ static void __cpuinit migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } - double_spin_unlock(&new_base->lock, &old_base->lock, - smp_processor_id() < cpu); + spin_unlock(&old_base->lock); + spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(tvec_bases); } diff --git a/kernel/uid16.c b/kernel/uid16.c index dd308ba4e03..3e41c1673e2 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -21,7 +21,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi { long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; } @@ -29,7 +29,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g { long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, filename, user, group); return ret; } @@ -37,7 +37,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) { long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, fd, user, group); return ret; } @@ -45,7 +45,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) { long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, rgid, egid); return ret; } @@ -53,7 +53,7 @@ asmlinkage long sys_setgid16(old_gid_t gid) { long ret = sys_setgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; } @@ -61,7 +61,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) { long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(2, ret, ruid, euid); return ret; } @@ -69,7 +69,7 @@ asmlinkage long sys_setuid16(old_uid_t uid) { long ret = sys_setuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; } @@ -78,7 +78,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, ruid, euid, suid); return ret; } @@ -98,7 +98,7 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(3, ret, rgid, egid, sgid); return ret; } @@ -117,7 +117,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid) { long ret = sys_setfsuid(low2highuid(uid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, uid); return ret; } @@ -125,7 +125,7 @@ asmlinkage long sys_setfsgid16(old_gid_t gid) { long ret = sys_setfsgid(low2highgid(gid)); /* avoid REGPARM breakage on x86: */ - prevent_tail_call(ret); + asmlinkage_protect(1, ret, gid); return ret; } diff --git a/kernel/user.c b/kernel/user.c index 7132022a040..865ecf57a09 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -53,10 +53,6 @@ struct user_struct root_user = { .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, -#ifdef CONFIG_KEYS - .uid_keyring = &root_user_keyring, - .session_keyring = &root_session_keyring, -#endif #ifdef CONFIG_USER_SCHED .tg = &init_task_group, #endif @@ -101,7 +97,7 @@ static int sched_create_user(struct user_struct *up) { int rc = 0; - up->tg = sched_create_group(); + up->tg = sched_create_group(&root_task_group); if (IS_ERR(up->tg)) rc = -ENOMEM; @@ -193,6 +189,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, static struct kobj_attribute cpu_rt_runtime_attr = __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); + +static ssize_t cpu_rt_period_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); +} + +static ssize_t cpu_rt_period_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_period; + int rc; + + sscanf(buf, "%lu", &rt_period); + + rc = sched_group_set_rt_period(up->tg, rt_period); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_period_attr = + __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); #endif /* default attributes per uid directory */ @@ -202,6 +225,7 @@ static struct attribute *uids_attributes[] = { #endif #ifdef CONFIG_RT_GROUP_SCHED &cpu_rt_runtime_attr.attr, + &cpu_rt_period_attr.attr, #endif NULL }; @@ -360,7 +384,7 @@ void free_uid(struct user_struct *up) local_irq_restore(flags); } -struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) +struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) { struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up, *new; @@ -375,29 +399,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) spin_unlock_irq(&uidhash_lock); if (!up) { - new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); + new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) goto out_unlock; new->uid = uid; atomic_set(&new->__count, 1); - atomic_set(&new->processes, 0); - atomic_set(&new->files, 0); - atomic_set(&new->sigpending, 0); -#ifdef CONFIG_INOTIFY_USER - atomic_set(&new->inotify_watches, 0); - atomic_set(&new->inotify_devs, 0); -#endif -#ifdef CONFIG_POSIX_MQUEUE - new->mq_bytes = 0; -#endif - new->locked_shm = 0; - - if (alloc_uid_keyring(new, current) < 0) - goto out_free_user; if (sched_create_user(new) < 0) - goto out_put_keys; + goto out_free_user; if (uids_user_create(new)) goto out_destoy_sched; @@ -431,9 +441,6 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) out_destoy_sched: sched_destroy_user(new); -out_put_keys: - key_put(new->uid_keyring); - key_put(new->session_keyring); out_free_user: kmem_cache_free(uid_cachep, new); out_unlock: diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 4c9006275df..a9ab0596de4 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/version.h> #include <linux/nsproxy.h> +#include <linux/slab.h> #include <linux/user_namespace.h> /* @@ -73,3 +74,4 @@ void free_user_ns(struct kref *kref) release_uids(ns); kfree(ns); } +EXPORT_SYMBOL(free_user_ns); diff --git a/kernel/utsname.c b/kernel/utsname.c index 816d7b24fa0..64d398f1244 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include <linux/utsname.h> #include <linux/version.h> #include <linux/err.h> +#include <linux/slab.h> /* * Clone a new ns copying an original utsname, setting refcount to 1 diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ff06611655a..29fc39f1029 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -158,8 +158,8 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, * * Returns 0 if @work was already on a queue, non-zero otherwise. * - * We queue the work to the CPU it was submitted, but there is no - * guarantee that it will be processed by that CPU. + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. */ int queue_work(struct workqueue_struct *wq, struct work_struct *work) { @@ -195,7 +195,6 @@ static void delayed_work_timer_fn(unsigned long __data) int queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { - timer_stats_timer_set_start_info(&dwork->timer); if (delay == 0) return queue_work(wq, &dwork->work); @@ -223,6 +222,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); + timer_stats_timer_set_start_info(&dwork->timer); + /* This stores cwq for the moment, for the timer_fn */ set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); timer->expires = jiffies + delay; @@ -246,7 +247,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) if (cwq->run_depth > 3) { /* morton gets to eat his hat */ printk("%s: recursion depth exceeded: %d\n", - __FUNCTION__, cwq->run_depth); + __func__, cwq->run_depth); dump_stack(); } while (!list_empty(&cwq->worklist)) { @@ -563,7 +564,6 @@ EXPORT_SYMBOL(schedule_work); int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - timer_stats_timer_set_start_info(&dwork->timer); return queue_delayed_work(keventd_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -770,7 +770,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* * Our caller is either destroy_workqueue() or CPU_DEAD, @@ -806,19 +806,16 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) void destroy_workqueue(struct workqueue_struct *wq) { const cpumask_t *cpu_map = wq_cpu_map(wq); - struct cpu_workqueue_struct *cwq; int cpu; get_online_cpus(); spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - put_online_cpus(); - for_each_cpu_mask(cpu, *cpu_map) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - cleanup_workqueue_thread(cwq, cpu); - } + for_each_cpu_mask(cpu, *cpu_map) + cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); + put_online_cpus(); free_percpu(wq->cpu_wq); kfree(wq); @@ -836,7 +833,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, action &= ~CPU_TASKS_FROZEN; switch (action) { - case CPU_UP_PREPARE: cpu_set(cpu, cpu_populated_map); } @@ -859,11 +855,17 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: start_workqueue_thread(cwq, -1); case CPU_DEAD: - cleanup_workqueue_thread(cwq, cpu); + cleanup_workqueue_thread(cwq); break; } } + switch (action) { + case CPU_UP_CANCELED: + case CPU_DEAD: + cpu_clear(cpu, cpu_populated_map); + } + return NOTIFY_OK; } |