diff options
author | Paul Mundt <lethal@linux-sh.org> | 2012-02-24 13:23:23 +0900 |
---|---|---|
committer | Paul Mundt <lethal@linux-sh.org> | 2012-02-24 13:23:23 +0900 |
commit | 35eb304b5cd7b49d581bda79218b8134f3b689ea (patch) | |
tree | 3d75d9ada70814161d035b2f9166fee05d257dfb /kernel | |
parent | ca0cc30109241f280eb871794620d7cf198bb582 (diff) | |
parent | bb4c7e9a9908548b458f34afb2fee74dc0d49f90 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into rmobile-fixes-for-linus
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 4 | ||||
-rw-r--r-- | kernel/audit.h | 6 | ||||
-rw-r--r-- | kernel/auditfilter.c | 17 | ||||
-rw-r--r-- | kernel/auditsc.c | 738 | ||||
-rw-r--r-- | kernel/capability.c | 2 | ||||
-rw-r--r-- | kernel/events/callchain.c | 2 | ||||
-rw-r--r-- | kernel/events/core.c | 125 | ||||
-rw-r--r-- | kernel/exit.c | 19 | ||||
-rw-r--r-- | kernel/fork.c | 24 | ||||
-rw-r--r-- | kernel/kprobes.c | 8 | ||||
-rw-r--r-- | kernel/params.c | 3 | ||||
-rw-r--r-- | kernel/pid.c | 4 | ||||
-rw-r--r-- | kernel/power/power.h | 24 | ||||
-rw-r--r-- | kernel/power/process.c | 26 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 3 | ||||
-rw-r--r-- | kernel/power/user.c | 15 | ||||
-rw-r--r-- | kernel/rcutorture.c | 8 | ||||
-rw-r--r-- | kernel/relay.c | 10 | ||||
-rw-r--r-- | kernel/res_counter.c | 25 | ||||
-rw-r--r-- | kernel/sched/core.c | 19 | ||||
-rw-r--r-- | kernel/sched/cpupri.c | 3 | ||||
-rw-r--r-- | kernel/sched/fair.c | 34 | ||||
-rw-r--r-- | kernel/sched/rt.c | 5 | ||||
-rw-r--r-- | kernel/seccomp.c | 2 | ||||
-rw-r--r-- | kernel/tracepoint.c | 7 | ||||
-rw-r--r-- | kernel/watchdog.c | 2 |
26 files changed, 750 insertions, 385 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 57e3f510793..bb0eb5bb9a0 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, } *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); - audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", + audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", pid, uid, auid, ses); if (sid) { rc = security_secid_to_secctx(sid, &ctx, &len); @@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, char *p, *pathname; if (prefix) - audit_log_format(ab, " %s", prefix); + audit_log_format(ab, "%s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); diff --git a/kernel/audit.h b/kernel/audit.h index 91e7071c4d2..81676680337 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -36,12 +36,8 @@ enum audit_state { AUDIT_DISABLED, /* Do not create per-task audit_context. * No syscall-specific audit records can * be generated. */ - AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, - * but don't necessarily fill it in at - * syscall entry time (i.e., filter - * instead). */ AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, - * and always fill it in at syscall + * and fill it in at syscall * entry time. This makes a full * syscall record available if some * other part of the kernel decides it diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f8277c80d67..a6c3f1abd20 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) switch(listnr) { default: goto exit_err; - case AUDIT_FILTER_USER: - case AUDIT_FILTER_TYPE: #ifdef CONFIG_AUDITSYSCALL case AUDIT_FILTER_ENTRY: + if (rule->action == AUDIT_ALWAYS) + goto exit_err; case AUDIT_FILTER_EXIT: case AUDIT_FILTER_TASK: #endif + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { @@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) goto exit_free; break; case AUDIT_INODE: @@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_ARG1: case AUDIT_ARG2: case AUDIT_ARG3: + case AUDIT_OBJ_UID: + case AUDIT_OBJ_GID: break; case AUDIT_ARCH: entry->rule.arch_f = f; @@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILTERKEY: - err = -EINVAL; if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) goto exit_free; str = audit_unpack_string(&bufp, &remain, f->val); @@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_free; break; case AUDIT_FILETYPE: - if ((f->val & ~S_IFMT) > S_IFMT) + if (f->val & ~S_IFMT) + goto exit_free; + break; + case AUDIT_FIELD_COMPARE: + if (f->val > AUDIT_MAX_FIELD_COMPARE) goto exit_free; break; default: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e7fe2b0d29b..af1de0f34ea 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,9 +70,15 @@ #include "audit.h" +/* flags stating the success for a syscall */ +#define AUDITSC_INVALID 0 +#define AUDITSC_SUCCESS 1 +#define AUDITSC_FAILURE 2 + /* AUDIT_NAMES is the number of slots we reserve in the audit_context - * for saving names from getname(). */ -#define AUDIT_NAMES 20 + * for saving names from getname(). If we get more names we will allocate + * a name dynamically and also add those to the list anchored by names_list. */ +#define AUDIT_NAMES 5 /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 @@ -101,9 +107,8 @@ struct audit_cap_data { * * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { + struct list_head list; /* audit_context->names_list */ const char *name; - int name_len; /* number of name's characters to log */ - unsigned name_put; /* call __putname() for this name */ unsigned long ino; dev_t dev; umode_t mode; @@ -113,6 +118,14 @@ struct audit_names { u32 osid; struct audit_cap_data fcap; unsigned int fcap_ver; + int name_len; /* number of name's characters to log */ + bool name_put; /* call __putname() for this name */ + /* + * This was an allocated audit_names and not from the array of + * names allocated in the task audit context. Thus this name + * should be freed on syscall exit + */ + bool should_free; }; struct audit_aux_data { @@ -174,8 +187,17 @@ struct audit_context { long return_code;/* syscall return code */ u64 prio; int return_valid; /* return code is valid */ - int name_count; - struct audit_names names[AUDIT_NAMES]; + /* + * The names_list is the list of all audit_names collected during this + * syscall. The first AUDIT_NAMES entries in the names_list will + * actually be from the preallocated_names array for performance + * reasons. Except during allocation they should never be referenced + * through the preallocated_names array and should only be found/used + * by running the names_list. + */ + struct audit_names preallocated_names[AUDIT_NAMES]; + int name_count; /* total records in names_list */ + struct list_head names_list; /* anchor for struct audit_names->list */ char * filterkey; /* key for rule that triggered record */ struct path pwd; struct audit_context *previous; /* For nested syscalls */ @@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask) } } -static int audit_match_filetype(struct audit_context *ctx, int which) +static int audit_match_filetype(struct audit_context *ctx, int val) { - unsigned index = which & ~S_IFMT; - umode_t mode = which & S_IFMT; + struct audit_names *n; + umode_t mode = (umode_t)val; if (unlikely(!ctx)) return 0; - if (index >= ctx->name_count) - return 0; - if (ctx->names[index].ino == -1) - return 0; - if ((ctx->names[index].mode ^ mode) & S_IFMT) - return 0; - return 1; + list_for_each_entry(n, &ctx->names_list, list) { + if ((n->ino != -1) && + ((n->mode & S_IFMT) == mode)) + return 1; + } + + return 0; } /* @@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) return 0; } +static int audit_compare_id(uid_t uid1, + struct audit_names *name, + unsigned long name_offset, + struct audit_field *f, + struct audit_context *ctx) +{ + struct audit_names *n; + unsigned long addr; + uid_t uid2; + int rc; + + BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); + + if (name) { + addr = (unsigned long)name; + addr += name_offset; + + uid2 = *(uid_t *)addr; + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + + if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + addr = (unsigned long)n; + addr += name_offset; + + uid2 = *(uid_t *)addr; + + rc = audit_comparator(uid1, f->op, uid2); + if (rc) + return rc; + } + } + return 0; +} + +static int audit_field_compare(struct task_struct *tsk, + const struct cred *cred, + struct audit_field *f, + struct audit_context *ctx, + struct audit_names *name) +{ + switch (f->val) { + /* process to file object comparisons */ + case AUDIT_COMPARE_UID_TO_OBJ_UID: + return audit_compare_id(cred->uid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_GID_TO_OBJ_GID: + return audit_compare_id(cred->gid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_EUID_TO_OBJ_UID: + return audit_compare_id(cred->euid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_EGID_TO_OBJ_GID: + return audit_compare_id(cred->egid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_AUID_TO_OBJ_UID: + return audit_compare_id(tsk->loginuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SUID_TO_OBJ_UID: + return audit_compare_id(cred->suid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_SGID_TO_OBJ_GID: + return audit_compare_id(cred->sgid, + name, offsetof(struct audit_names, gid), + f, ctx); + case AUDIT_COMPARE_FSUID_TO_OBJ_UID: + return audit_compare_id(cred->fsuid, + name, offsetof(struct audit_names, uid), + f, ctx); + case AUDIT_COMPARE_FSGID_TO_OBJ_GID: + return audit_compare_id(cred->fsgid, + name, offsetof(struct audit_names, gid), + f, ctx); + /* uid comparisons */ + case AUDIT_COMPARE_UID_TO_AUID: + return audit_comparator(cred->uid, f->op, tsk->loginuid); + case AUDIT_COMPARE_UID_TO_EUID: + return audit_comparator(cred->uid, f->op, cred->euid); + case AUDIT_COMPARE_UID_TO_SUID: + return audit_comparator(cred->uid, f->op, cred->suid); + case AUDIT_COMPARE_UID_TO_FSUID: + return audit_comparator(cred->uid, f->op, cred->fsuid); + /* auid comparisons */ + case AUDIT_COMPARE_AUID_TO_EUID: + return audit_comparator(tsk->loginuid, f->op, cred->euid); + case AUDIT_COMPARE_AUID_TO_SUID: + return audit_comparator(tsk->loginuid, f->op, cred->suid); + case AUDIT_COMPARE_AUID_TO_FSUID: + return audit_comparator(tsk->loginuid, f->op, cred->fsuid); + /* euid comparisons */ + case AUDIT_COMPARE_EUID_TO_SUID: + return audit_comparator(cred->euid, f->op, cred->suid); + case AUDIT_COMPARE_EUID_TO_FSUID: + return audit_comparator(cred->euid, f->op, cred->fsuid); + /* suid comparisons */ + case AUDIT_COMPARE_SUID_TO_FSUID: + return audit_comparator(cred->suid, f->op, cred->fsuid); + /* gid comparisons */ + case AUDIT_COMPARE_GID_TO_EGID: + return audit_comparator(cred->gid, f->op, cred->egid); + case AUDIT_COMPARE_GID_TO_SGID: + return audit_comparator(cred->gid, f->op, cred->sgid); + case AUDIT_COMPARE_GID_TO_FSGID: + return audit_comparator(cred->gid, f->op, cred->fsgid); + /* egid comparisons */ + case AUDIT_COMPARE_EGID_TO_SGID: + return audit_comparator(cred->egid, f->op, cred->sgid); + case AUDIT_COMPARE_EGID_TO_FSGID: + return audit_comparator(cred->egid, f->op, cred->fsgid); + /* sgid comparison */ + case AUDIT_COMPARE_SGID_TO_FSGID: + return audit_comparator(cred->sgid, f->op, cred->fsgid); + default: + WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); + return 0; + } + return 0; +} + /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. @@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk, bool task_creation) { const struct cred *cred; - int i, j, need_sid = 1; + int i, need_sid = 1; u32 sid; cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + struct audit_names *n; int result = 0; switch (f->type) { @@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (name) - result = audit_comparator(MAJOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { + if (name) { + if (audit_comparator(MAJOR(name->dev), f->op, f->val) || + audit_comparator(MAJOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MAJOR(n->dev), f->op, f->val) || + audit_comparator(MAJOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (name) - result = audit_comparator(MINOR(name->dev), - f->op, f->val); - else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { + if (name) { + if (audit_comparator(MINOR(name->dev), f->op, f->val) || + audit_comparator(MINOR(name->rdev), f->op, f->val)) + ++result; + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(MINOR(n->dev), f->op, f->val) || + audit_comparator(MINOR(n->rdev), f->op, f->val)) { ++result; break; } @@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk, if (name) result = (name->ino == f->val); else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->ino, f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_OBJ_UID: + if (name) { + result = audit_comparator(name->uid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->uid, f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_OBJ_GID: + if (name) { + result = audit_comparator(name->gid, f->op, f->val); + } else if (ctx) { + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_comparator(n->gid, f->op, f->val)) { ++result; break; } @@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk, name->osid, f->type, f->op, f->lsm_rule, ctx); } else if (ctx) { - for (j = 0; j < ctx->name_count; j++) { - if (security_audit_rule_match( - ctx->names[j].osid, - f->type, f->op, - f->lsm_rule, ctx)) { + list_for_each_entry(n, &ctx->names_list, list) { + if (security_audit_rule_match(n->osid, f->type, + f->op, f->lsm_rule, + ctx)) { ++result; break; } @@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); break; + case AUDIT_FIELD_COMPARE: + result = audit_field_compare(tsk, cred, f, ctx, name); + break; } - if (!result) return 0; } @@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* At syscall exit time, this filter is called if any audit_names[] have been +/* + * Given an audit_name check the inode hash table to see if they match. + * Called holding the rcu read lock to protect the use of audit_inode_hash + */ +static int audit_filter_inode_name(struct task_struct *tsk, + struct audit_names *n, + struct audit_context *ctx) { + int word, bit; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + struct audit_entry *e; + enum audit_state state; + + word = AUDIT_WORD(ctx->major); + bit = AUDIT_BIT(ctx->major); + + if (list_empty(list)) + return 0; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { + ctx->current_state = state; + return 1; + } + } + + return 0; +} + +/* At syscall exit time, this filter is called if any audit_names have been * collected during syscall processing. We only check rules in sublists at hash - * buckets applicable to the inode numbers in audit_names[]. + * buckets applicable to the inode numbers in audit_names. * Regarding audit_state, same rules apply as for audit_filter_syscall(). */ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) { - int i; - struct audit_entry *e; - enum audit_state state; + struct audit_names *n; if (audit_pid && tsk->tgid == audit_pid) return; rcu_read_lock(); - for (i = 0; i < ctx->name_count; i++) { - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); - struct audit_names *n = &ctx->names[i]; - int h = audit_hash_ino((u32)n->ino); - struct list_head *list = &audit_inode_hash[h]; - - if (list_empty(list)) - continue; - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + list_for_each_entry(n, &ctx->names_list, list) { + if (audit_filter_inode_name(tsk, n, ctx)) + break; } rcu_read_unlock(); } @@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, { struct audit_context *context = tsk->audit_context; - if (likely(!context)) + if (!context) return NULL; context->return_valid = return_valid; @@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, static inline void audit_free_names(struct audit_context *context) { - int i; + struct audit_names *n, *next; #if AUDIT_DEBUG == 2 if (context->put_count + context->ino_count != context->name_count) { @@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context) context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); - for (i = 0; i < context->name_count; i++) { + list_for_each_entry(n, &context->names_list, list) { printk(KERN_ERR "names[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); + n->name, n->name ?: "(null)"); } dump_stack(); return; @@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context) context->ino_count = 0; #endif - for (i = 0; i < context->name_count; i++) { - if (context->names[i].name && context->names[i].name_put) - __putname(context->names[i].name); + list_for_each_entry_safe(n, next, &context->names_list, list) { + list_del(&n->list); + if (n->name && n->name_put) + __putname(n->name); + if (n->should_free) + kfree(n); } context->name_count = 0; path_put(&context->pwd); @@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) return NULL; audit_zero_context(context, state); INIT_LIST_HEAD(&context->killed_trees); + INIT_LIST_HEAD(&context->names_list); return context; } @@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return 0; if (!(context = audit_alloc_context(state))) { @@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk while (vma) { if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) { - audit_log_d_path(ab, "exe=", + audit_log_d_path(ab, " exe=", &vma->vm_file->f_path); break; } @@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context, struct audit_buffer **ab, struct audit_aux_data_execve *axi) { - int i; - size_t len, len_sent = 0; + int i, len; + size_t len_sent = 0; const char __user *p; char *buf; @@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static void audit_log_name(struct audit_context *context, struct audit_names *n, + int record_num, int *call_panic) +{ + struct audit_buffer *ab; + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", record_num); + + if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#ho" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { const struct cred *cred; @@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; + struct audit_names *n; /* tsk == current */ context->pid = tsk->pid; @@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts if (context->pwd.dentry && context->pwd.mnt) { ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { - audit_log_d_path(ab, "cwd=", &context->pwd); + audit_log_d_path(ab, " cwd=", &context->pwd); audit_log_end(ab); } } - for (i = 0; i < context->name_count; i++) { - struct audit_names *n = &context->names[i]; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - continue; /* audit_panic has been called */ - - audit_log_format(ab, "item=%d", i); - - if (n->name) { - switch(n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, "name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != (unsigned long)-1) { - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - n->uid, - n->gid, - MAJOR(n->rdev), - MINOR(n->rdev)); - } - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - audit_log_fcaps(ab, n); - - audit_log_end(ab); - } + i = 0; + list_for_each_entry(n, &context->names_list, list) + audit_log_name(context, n, i++, &call_panic); /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts * * Called from copy_process and do_exit */ -void audit_free(struct task_struct *tsk) +void __audit_free(struct task_struct *tsk) { struct audit_context *context; context = audit_get_context(tsk, 0, 0); - if (likely(!context)) + if (!context) return; /* Check for system calls that do not go through the exit @@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(int arch, int major, +void __audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { @@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major, struct audit_context *context = tsk->audit_context; enum audit_state state; - if (unlikely(!context)) + if (!context) return; /* @@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major, context->prio = 0; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); } - if (likely(state == AUDIT_DISABLED)) + if (state == AUDIT_DISABLED) return; context->serial = 0; @@ -1658,45 +1861,29 @@ void audit_syscall_entry(int arch, int major, context->ppid = 0; } -void audit_finish_fork(struct task_struct *child) -{ - struct audit_context *ctx = current->audit_context; - struct audit_context *p = child->audit_context; - if (!p || !ctx) - return; - if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) - return; - p->arch = ctx->arch; - p->major = ctx->major; - memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); - p->ctime = ctx->ctime; - p->dummy = ctx->dummy; - p->in_syscall = ctx->in_syscall; - p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); - p->ppid = current->pid; - p->prio = ctx->prio; - p->current_state = ctx->current_state; -} - /** * audit_syscall_exit - deallocate audit context after a system call - * @valid: success/failure flag - * @return_code: syscall return value + * @success: success value of the syscall + * @return_code: return value of the syscall * * Tear down after system call. If the audit context has been marked as * auditable (either because of the AUDIT_RECORD_CONTEXT state from - * filtering, or because some other part of the kernel write an audit + * filtering, or because some other part of the kernel wrote an audit * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(int valid, long return_code) +void __audit_syscall_exit(int success, long return_code) { struct task_struct *tsk = current; struct audit_context *context; - context = audit_get_context(tsk, valid, return_code); + if (success) + success = AUDITSC_SUCCESS; + else + success = AUDITSC_FAILURE; - if (likely(!context)) + context = audit_get_context(tsk, success, return_code); + if (!context) return; if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) @@ -1821,6 +2008,30 @@ retry: #endif } +static struct audit_names *audit_alloc_name(struct audit_context *context) +{ + struct audit_names *aname; + + if (context->name_count < AUDIT_NAMES) { + aname = &context->preallocated_names[context->name_count]; + memset(aname, 0, sizeof(*aname)); + } else { + aname = kzalloc(sizeof(*aname), GFP_NOFS); + if (!aname) + return NULL; + aname->should_free = true; + } + + aname->ino = (unsigned long)-1; + list_add_tail(&aname->list, &context->names_list); + + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + return aname; +} + /** * audit_getname - add a name to the list * @name: name to add @@ -1831,9 +2042,7 @@ retry: void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - - if (IS_ERR(name) || !name) - return; + struct audit_names *n; if (!context->in_syscall) { #if AUDIT_DEBUG == 2 @@ -1843,13 +2052,15 @@ void __audit_getname(const char *name) #endif return; } - BUG_ON(context->name_count >= AUDIT_NAMES); - context->names[context->name_count].name = name; - context->names[context->name_count].name_len = AUDIT_NAME_FULL; - context->names[context->name_count].name_put = 1; - context->names[context->name_count].ino = (unsigned long)-1; - context->names[context->name_count].osid = 0; - ++context->name_count; + + n = audit_alloc_name(context); + if (!n) + return; + + n->name = name; + n->name_len = AUDIT_NAME_FULL; + n->name_put = true; + if (!context->pwd.dentry) get_fs_pwd(current->fs, &context->pwd); } @@ -1871,12 +2082,13 @@ void audit_putname(const char *name) printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { + struct audit_names *n; int i; - for (i = 0; i < context->name_count; i++) + + list_for_each_entry(n, &context->names_list, list) printk(KERN_ERR "name[%d] = %p = %s\n", i, - context->names[i].name, - context->names[i].name ?: "(null)"); - } + n->name, n->name ?: "(null)"); + } #endif __putname(name); } @@ -1897,39 +2109,11 @@ void audit_putname(const char *name) #endif } -static int audit_inc_name_count(struct audit_context *context, - const struct inode *inode) -{ - if (context->name_count >= AUDIT_NAMES) { - if (inode) - printk(KERN_DEBUG "audit: name_count maxed, losing inode data: " - "dev=%02x:%02x, inode=%lu\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), - inode->i_ino); - - else - printk(KERN_DEBUG "name_count maxed, losing inode data\n"); - return 1; - } - context->name_count++; -#if AUDIT_DEBUG - context->ino_count++; -#endif - return 0; -} - - static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) { struct cpu_vfs_cap_data caps; int rc; - memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t)); - memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t)); - name->fcap.fE = 0; - name->fcap_ver = 0; - if (!dentry) return 0; @@ -1969,30 +2153,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent */ void __audit_inode(const char *name, const struct dentry *dentry) { - int idx; struct audit_context *context = current->audit_context; const struct inode *inode = dentry->d_inode; + struct audit_names *n; if (!context->in_syscall) return; - if (context->name_count - && context->names[context->name_count-1].name - && context->names[context->name_count-1].name == name) - idx = context->name_count - 1; - else if (context->name_count > 1 - && context->names[context->name_count-2].name - && context->names[context->name_count-2].name == name) - idx = context->name_count - 2; - else { - /* FIXME: how much do we care about inodes that have no - * associated name? */ - if (audit_inc_name_count(context, inode)) - return; - idx = context->name_count - 1; - context->names[idx].name = NULL; + + list_for_each_entry_reverse(n, &context->names_list, list) { + if (n->name && (n->name == name)) + goto out; } + + /* unable to find the name from a previous getname() */ + n = audit_alloc_name(context); + if (!n) + return; +out: handle_path(dentry); - audit_copy_inode(&context->names[idx], dentry, inode); + audit_copy_inode(n, dentry, inode); } /** @@ -2011,11 +2190,11 @@ void __audit_inode(const char *name, const struct dentry *dentry) void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { - int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; const char *dname = dentry->d_name.name; + struct audit_names *n; int dirlen = 0; if (!context->in_syscall) @@ -2025,9 +2204,7 @@ void __audit_inode_child(const struct dentry *dentry, handle_one(inode); /* parent is more likely, look for it first */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2040,9 +2217,7 @@ void __audit_inode_child(const struct dentry *dentry, } /* no matching parent, look for matching child */ - for (idx = 0; idx < context->name_count; idx++) { - struct audit_names *n = &context->names[idx]; - + list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; @@ -2060,34 +2235,29 @@ void __audit_inode_child(const struct dentry *dentry, add_names: if (!found_parent) { - if (audit_inc_name_count(context, parent)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; - context->names[idx].name = NULL; - audit_copy_inode(&context->names[idx], NULL, parent); + audit_copy_inode(n, NULL, parent); } if (!found_child) { - if (audit_inc_name_count(context, inode)) + n = audit_alloc_name(context); + if (!n) return; - idx = context->name_count - 1; /* Re-use the name belonging to the slot for a matching parent * directory. All names for this context are relinquished in * audit_free_names() */ if (found_parent) { - context->names[idx].name = found_parent; - context->names[idx].name_len = AUDIT_NAME_FULL; + n->name = found_parent; + n->name_len = AUDIT_NAME_FULL; /* don't call __putname() */ - context->names[idx].name_put = 0; - } else { - context->names[idx].name = NULL; + n->name_put = false; } if (inode) - audit_copy_inode(&context->names[idx], NULL, inode); - else - context->names[idx].ino = (unsigned long)-1; + audit_copy_inode(n, NULL, inode); } } EXPORT_SYMBOL_GPL(__audit_inode_child); @@ -2121,19 +2291,28 @@ int auditsc_get_stamp(struct audit_context *ctx, static atomic_t session_id = ATOMIC_INIT(0); /** - * audit_set_loginuid - set a task's audit_context loginuid - * @task: task whose audit context is being modified + * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value * * Returns 0. * * Called (set) from fs/proc/base.c::proc_loginuid_write(). */ -int audit_set_loginuid(struct task_struct *task, uid_t loginuid) +int audit_set_loginuid(uid_t loginuid) { - unsigned int sessionid = atomic_inc_return(&session_id); + struct task_struct *task = current; struct audit_context *context = task->audit_context; + unsigned int sessionid; + +#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE + if (task->loginuid != -1) + return -EPERM; +#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; +#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + sessionid = atomic_inc_return(&session_id); if (context && context->in_syscall) { struct audit_buffer *ab; @@ -2271,14 +2450,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int audit_bprm(struct linux_binprm *bprm) +int __audit_bprm(struct linux_binprm *bprm) { struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - if (likely(!audit_enabled || !context || context->dummy)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); if (!ax) return -ENOMEM; @@ -2299,13 +2475,10 @@ int audit_bprm(struct linux_binprm *bprm) * @args: args array * */ -void audit_socketcall(int nargs, unsigned long *args) +void __audit_socketcall(int nargs, unsigned long *args) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return; - context->type = AUDIT_SOCKETCALL; context->socketcall.nargs = nargs; memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); @@ -2331,13 +2504,10 @@ void __audit_fd_pair(int fd1, int fd2) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_sockaddr(int len, void *a) +int __audit_sockaddr(int len, void *a) { struct audit_context *context = current->audit_context; - if (likely(!context || context->dummy)) - return 0; - if (!context->sockaddr) { void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); if (!p) @@ -2499,6 +2669,25 @@ void __audit_mmap_fd(int fd, int flags) context->type = AUDIT_MMAP; } +static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) +{ + uid_t auid, uid; + gid_t gid; + unsigned int sessionid; + + auid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + current_uid_gid(&uid, &gid); + + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + audit_log_task_context(ab); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " reason="); + audit_log_string(ab, reason); + audit_log_format(ab, " sig=%ld", signr); +} /** * audit_core_dumps - record information about processes that end abnormally * @signr: signal value @@ -2509,10 +2698,6 @@ void __audit_mmap_fd(int fd, int flags) void audit_core_dumps(long signr) { struct audit_buffer *ab; - u32 sid; - uid_t auid = audit_get_loginuid(current), uid; - gid_t gid; - unsigned int sessionid = audit_get_sessionid(current); if (!audit_enabled) return; @@ -2521,24 +2706,17 @@ void audit_core_dumps(long signr) return; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); - current_uid_gid(&uid, &gid); - audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", - auid, uid, gid, sessionid); - security_task_getsecid(current, &sid); - if (sid) { - char *ctx = NULL; - u32 len; + audit_log_abend(ab, "memory violation", signr); + audit_log_end(ab); +} - if (security_secid_to_secctx(sid, &ctx, &len)) - audit_log_format(ab, " ssid=%u", sid); - else { - audit_log_format(ab, " subj=%s", ctx); - security_release_secctx(ctx, len); - } - } - audit_log_format(ab, " pid=%d comm=", current->pid); - audit_log_untrustedstring(ab, current->comm); - audit_log_format(ab, " sig=%ld", signr); +void __audit_seccomp(unsigned long syscall) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + audit_log_abend(ab, "seccomp", SIGKILL); + audit_log_format(ab, " syscall=%ld", syscall); audit_log_end(ab); } diff --git a/kernel/capability.c b/kernel/capability.c index 0fcf1c14a29..3f1adb6c647 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap) BUG(); } - if (has_ns_capability(current, ns, cap)) { + if (security_capable(current_cred(), ns, cap) == 0) { current->flags |= PF_SUPERPRIV; return true; } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 057e24b665c..6581a040f39 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -115,8 +115,6 @@ int get_callchain_buffers(void) } err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); exit: mutex_unlock(&callchain_mutex); diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a0..1b5c081d8b9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -815,7 +815,7 @@ static void update_event_times(struct perf_event *event) * here. */ if (is_cgroup_event(event)) - run_end = perf_event_time(event); + run_end = perf_cgroup_event_time(event); else if (ctx->is_active) run_end = ctx->time; else @@ -2300,7 +2300,10 @@ do { \ return div64_u64(dividend, divisor); } -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static DEFINE_PER_CPU(int, perf_throttled_count); +static DEFINE_PER_CPU(u64, perf_throttled_seq); + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2319,22 +2322,40 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); + if (disable) + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) +/* + * combine freq adjustment with unthrottling to avoid two passes over the + * events. At the same time, make sure, having freq events does not change + * the rate of unthrottling as that would introduce bias. + */ +static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, + int needs_unthr) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, now; + u64 now, period = TICK_NSEC; s64 delta; - if (!ctx->nr_freq) + /* + * only need to iterate over all events iff: + * - context have events in frequency mode (needs freq adjust) + * - there are events to unthrottle on this cpu + */ + if (!(ctx->nr_freq || needs_unthr)) return; + raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2344,13 +2365,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) hwc = &event->hw; - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle events on the tick - */ - if (interrupts == MAX_INTERRUPTS) { + if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { + hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); } @@ -2358,14 +2374,30 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (!event->attr.freq || !event->attr.sample_freq) continue; - event->pmu->read(event); + /* + * stop the event and update event->count + */ + event->pmu->stop(event, PERF_EF_UPDATE); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; + /* + * restart the event + * reload only if value has changed + * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. + */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); + + event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + + perf_pmu_enable(ctx->pmu); + raw_spin_unlock(&ctx->lock); } /* @@ -2388,16 +2420,13 @@ static void rotate_ctx(struct perf_event_context *ctx) */ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1, freq = 0; + int rotate = 0, remove = 1; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; - if (cpuctx->ctx.nr_freq) - freq = 1; } ctx = cpuctx->task_ctx; @@ -2405,37 +2434,26 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; - if (ctx->nr_freq) - freq = 1; } - if (!rotate && !freq) + if (!rotate) goto done; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (freq) { - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - } - - if (rotate) { - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); - } + perf_event_sched_in(cpuctx, ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - done: if (remove) list_del_init(&cpuctx->rotation_list); @@ -2445,10 +2463,22 @@ void perf_event_task_tick(void) { struct list_head *head = &__get_cpu_var(rotation_list); struct perf_cpu_context *cpuctx, *tmp; + struct perf_event_context *ctx; + int throttled; WARN_ON(!irqs_disabled()); + __this_cpu_inc(perf_throttled_seq); + throttled = __this_cpu_xchg(perf_throttled_count, 0); + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + ctx = &cpuctx->ctx; + perf_adjust_freq_unthr_context(ctx, throttled); + + ctx = cpuctx->task_ctx; + if (ctx) + perf_adjust_freq_unthr_context(ctx, throttled); + if (cpuctx->jiffies_interval == 1 || !(jiffies % cpuctx->jiffies_interval)) perf_rotate_context(cpuctx); @@ -4509,6 +4539,7 @@ static int __perf_event_overflow(struct perf_event *event, { int events = atomic_read(&event->event_limit); struct hw_perf_event *hwc = &event->hw; + u64 seq; int ret = 0; /* @@ -4518,14 +4549,20 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(!is_sampling_event(event))) return 0; - if (unlikely(hwc->interrupts >= max_samples_per_tick)) { - if (throttle) { + seq = __this_cpu_read(perf_throttled_seq); + if (seq != hwc->interrupts_seq) { + hwc->interrupts_seq = seq; + hwc->interrupts = 1; + } else { + hwc->interrupts++; + if (unlikely(throttle + && hwc->interrupts >= max_samples_per_tick)) { + __this_cpu_inc(perf_throttled_count); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); ret = 1; } - } else - hwc->interrupts++; + } if (event->attr.freq) { u64 now = perf_clock(); @@ -4534,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* diff --git a/kernel/exit.c b/kernel/exit.c index c44738267be..4b4042f9bc6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -964,8 +964,7 @@ void do_exit(long code) acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); - if (unlikely(tsk->audit_context)) - audit_free(tsk); + audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); @@ -1039,6 +1038,22 @@ void do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ diff --git a/kernel/fork.c b/kernel/fork.c index f3fa18887cc..b77fd559c78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -647,6 +647,26 @@ struct mm_struct *get_task_mm(struct task_struct *task) } EXPORT_SYMBOL_GPL(get_task_mm); +struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) +{ + struct mm_struct *mm; + int err; + + err = mutex_lock_killable(&task->signal->cred_guard_mutex); + if (err) + return ERR_PTR(err); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, mode)) { + mmput(mm); + mm = ERR_PTR(-EACCES); + } + mutex_unlock(&task->signal->cred_guard_mutex); + + return mm; +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. @@ -890,7 +910,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc, NULL); + put_io_context(new_ioc); } #endif return 0; @@ -1527,8 +1547,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - audit_finish_fork(p); - /* * We set PF_STARTING at creation in case tracing wants to * use this to distinguish a fully live task from one that diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 95dd7212e61..9788c0ec6f4 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) /* Early boot. kretprobe_table_locks not yet initialized. */ return; + INIT_HLIST_HEAD(&empty_rp); hash = hash_ptr(tk, KPROBE_HASH_BITS); head = &kretprobe_inst_table[hash]; kretprobe_table_lock(hash, &flags); @@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) recycle_rp_inst(ri, &empty_rp); } kretprobe_table_unlock(hash, &flags); - INIT_HLIST_HEAD(&empty_rp); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); @@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); diff --git a/kernel/params.c b/kernel/params.c index 32ee0430828..4bc965d8a1f 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -97,7 +97,8 @@ static int parse_one(char *param, for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool) + if (!val && params[i].ops->set != param_set_bool + && params[i].ops->set != param_set_bint) return -EINVAL; pr_debug("They are equal! Calling %p\n", params[i].ops->set); diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deacc..9f08dfabaf1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe6d3b..21724eee520 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -231,8 +231,28 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + goto Finish; + + error = freeze_kernel_threads(); + + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + Finish: + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index 77274c9ba2f..7e426459e60 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } @@ -188,3 +191,22 @@ void thaw_processes(void) printk("done.\n"); } +void thaw_kernel_threads(void) +{ + struct task_struct *g, *p; + + pm_nosig_freezing = false; + printk("Restarting kernel threads ... "); + + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + __thaw_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + schedule(); + printk("done.\n"); +} diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1cf88900ec4..6a768e53700 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone) unsigned int res; res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), + LINKED_PAGE_DATA_SIZE); return 2 * res; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 6b1ab7a8852..3e100075b13 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,13 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) { + if (error) { + thaw_kernel_threads(); + } else { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; if (freezer_test_done) { freezer_test_done = false; - thaw_processes(); + thaw_kernel_threads(); } } break; @@ -274,6 +276,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = 0; + /* + * It is necessary to thaw kernel threads here, because + * SNAPSHOT_CREATE_IMAGE may be invoked directly after + * SNAPSHOT_FREE. In that case, if kernel threads were not + * thawed, the preallocation of memory carried out by + * hibernation_snapshot() might run into problems (i.e. it + * might fail or even deadlock). + */ + thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88f17b8a3b1..a58ac285fc6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -56,8 +56,8 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ static int nfakewriters = 4; /* # fake writer threads */ static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose; /* Print more debug info. */ -static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static bool verbose; /* Print more debug info. */ +static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ @@ -1399,7 +1399,7 @@ rcu_torture_shutdown(void *arg) * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ -static int +static int __cpuinit rcu_torture_onoff(void *arg) { int cpu; @@ -1447,7 +1447,7 @@ rcu_torture_onoff(void *arg) return 0; } -static int +static int __cpuinit rcu_torture_onoff_init(void) { if (onoff_interval <= 0) diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d7ee2..ab56a1764d4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 6d269cce7aa..d508363858b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -66,6 +66,31 @@ done: return ret; } +int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) +{ + int ret, r; + unsigned long flags; + struct res_counter *c; + + r = ret = 0; + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + r = res_counter_charge_locked(c, val); + if (r) + c->usage += val; + spin_unlock(&c->lock); + if (r < 0 && ret == 0) { + *limit_fail_at = c; + ret = r; + } + } + local_irq_restore(flags); + + return ret; +} void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) { if (WARN_ON(counter->usage < val)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263..5255c9d2e05 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> +#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif @@ -723,9 +724,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) p->sched_class->dequeue_task(rq, p, flags); } -/* - * activate_task - move a task to the runqueue. - */ void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -734,9 +732,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) enqueue_task(rq, p, flags); } -/* - * deactivate_task - remove a task from the runqueue. - */ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) @@ -4134,7 +4129,7 @@ recheck: on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); if (running) p->sched_class->put_prev_task(rq, p); @@ -4147,7 +4142,7 @@ recheck: if (running) p->sched_class->set_curr_task(rq); if (on_rq) - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); @@ -4998,9 +4993,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * placed properly. */ if (p->on_rq) { - deactivate_task(rq_src, p, 0); + dequeue_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); - activate_task(rq_dest, p, 0); + enqueue_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } done: @@ -7032,10 +7027,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p) on_rq = p->on_rq; if (on_rq) - deactivate_task(rq, p, 0); + dequeue_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(rq, p, 0); + enqueue_task(rq, p, 0); resched_task(rq->curr); } diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index b0d798eaf13..d72586fdf66 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the cpu priority setting * @cp: The cpupri context * @cpu: The target cpu - * @pri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID-RT99) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * @@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /** * cpupri_init - initialize the cpupri structure * @cp: The cpupri context - * @bootmem: true if allocations need to use bootmem * * Returns: -ENOMEM if memory fails. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 84adb2d66cb..7c6414fc669 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4866,6 +4866,15 @@ static void nohz_balancer_kick(int cpu) return; } +static inline void clear_nohz_tick_stopped(int cpu) +{ + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + } +} + static inline void set_cpu_sd_state_busy(void) { struct sched_domain *sd; @@ -4904,6 +4913,12 @@ void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); + /* + * If this cpu is going down, then nothing needs to be done. + */ + if (!cpu_active(cpu)) + return; + if (stop_tick) { if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; @@ -4914,6 +4929,18 @@ void select_nohz_load_balancer(int stop_tick) } return; } + +static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DYING: + clear_nohz_tick_stopped(smp_processor_id()); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} #endif static DEFINE_SPINLOCK(balancing); @@ -5070,11 +5097,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) * busy tick after returning from idle, we will update the busy stats. */ set_cpu_sd_state_busy(); - if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + clear_nohz_tick_stopped(cpu); /* * None are in tickless mode and hence no need for NOHZ idle load @@ -5590,6 +5613,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_NO_HZ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + cpu_notifier(sched_ilb_notifier, 0); #endif #endif /* SMP */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3640ebbb466..f42ae7fb5ec 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,6 +1587,11 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + if (unlikely(task_running(rq, next_task))) + return 0; +#endif + retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13b631..e8d76c5895e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -6,6 +6,7 @@ * This defines a simple but solid secure-computing mode. */ +#include <linux/audit.h> #include <linux/seccomp.h> #include <linux/sched.h> #include <linux/compat.h> @@ -54,6 +55,7 @@ void __secure_computing(int this_syscall) #ifdef SECCOMP_DEBUG dump_stack(); #endif + audit_seccomp(this_syscall); do_exit(SIGKILL); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index db110b8ae03..f1539decd99 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod) int ret = 0; /* - * We skip modules that tain the kernel, especially those with different - * module header (for forced load), to make sure we don't cause a crash. + * We skip modules that taint the kernel, especially those with different + * module headers (for forced load), to make sure we don't cause a crash. + * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints) + if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1d7bca7f4f5..d117262deba 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); |